setwd("C:/Users/20621066W/Desktop/T2DM")
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(patchwork)
library(purrr)
library(uwot)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ClusterR)
## Loading required package: gtools
library(splines)
library(flextable)
## 
## Attaching package: 'flextable'
## The following object is masked from 'package:purrr':
## 
##     compose
library(archetypes)
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: nnls
library(survival)
library(survminer)
## Loading required package: ggpubr
## 
## Attaching package: 'ggpubr'
## The following objects are masked from 'package:flextable':
## 
##     border, font, rotate
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
library(ggfortify)
library(ranger)
library(stats)
library(broom)

library("viridis")
## Loading required package: viridisLite
library(devtools)
## Loading required package: usethis

Preparación de los datos

# Carreguem les dades

load("bbdd_covar_T2DM.RData")
bbdd_covar <- bbdd_covar_T2DM
bbdd_covar <- bbdd_covar %>%
  mutate(obesity_BMI = as.numeric(obesity == 1 | 30 <= BMI),
         obesity_BMI = if_else(is.na(obesity_BMI), 0, obesity_BMI),
         sex_female = factor(sex_female, labels = c('Men', 'Women')),
         obesity_BMI = factor(obesity_BMI, labels = c('No obesity', 'Obesity')),
         TimeT2DM = TimeT2DM/365.25) %>% 
  mutate_at(.vars = c('A10', 'A10A', 'A10B', 'C01', 'C02', 'C03', 'C07', 'C08', 'C09', 'C10', 'M01A'),
            function(x) if_else(is.na(x), 0, x))

Descriptiva de las variables

desc_bbdd <- function(abbdd){
  n_sex <- abbdd %>%
    count(sex = sex_female)
  abbdd %>%
    group_by(sex_female) %>% 
    summarise(name = 'total',
             N = as.character(n()),
             resum = as.character(n())) %>%
    bind_rows(abbdd %>%
                select(rowId, sex_female, Current, Former, obesity_BMI, A10A, A10B, C02, C03, C07, C08,
                       C09, C10, M01A, angor, ami, stroke, tia) %>%
                mutate_if(is.numeric, as.character) %>% 
                # mutate(Current = as.character(Current),
                #        Former = as.character(Former),
                #        obesity_BMI = as.character(obesity_BMI)) %>%
                pivot_longer(cols = !contains(c('rowId', 'sex_female'))) %>%
                group_by(sex_female, name, value) %>%
                summarise(N = n(),
                          .groups = 'keep') %>%
                group_by(sex_female, name) %>%
                transmute(sex_female,
                          name,
                          value,
                          resum = sprintf('%i (%2.1f%%)', N, N/sum(N)*100)) %>%
                filter(value != 0)) %>% 
    bind_rows(abbdd %>%
                select(-Current, -obesity_BMI, -Former, -A10, -A10A, -A10B,
                       -C02, -C03, -C07, -C08, -C09, -C10, -M01A, -angor, -ami, -stroke, -tia) %>% 
                pivot_longer(cols = !contains(c('rowId', 'sex_female'))) %>%
                group_by(sex_female, name) %>%
                summarise(N = sum(!is.na(value)),
                          mn = mean(value, na.rm = T),
                          sd = sd(value, na.rm = T),
                          .groups = 'keep') %>%
                transmute(sex_female,
                          name, 
                          N = sprintf('%i (%2.1f%%)', N, N/n_sex$n[n_sex$sex == sex_female]*100),
                          resum = sprintf('%2.1f (%2.1f)', mn, sd))) %>%
    filter(name != 'NA') %>% 
    select(sex_female, name, value, N, resum) %>%
    pivot_wider(names_from = sex_female,
                values_from = c('N', 'resum')) %>% 
    flextable %>%
    autofit
}
#desc_bbdd(abbdd = bbdd_covar %>% select(-obesity, -T2DM))

Distribució variables numèriques

ggplot(bbdd_covar %>%
         select(rowId, age, BMI, HbA1c, Leukocytes, Monocytes, cLDL, Tg, cHDL, DBP, SBP, Glucose,
                TimeT2DM, Ferritin, sex_female, obesity_BMI) %>%
         pivot_longer(-c(rowId, sex_female, obesity_BMI)) %>%
         mutate(name = factor(name,
                              levels = c('age', 'BMI', 'HbA1c', 'Glucose', 'Leukocytes', 'Monocytes',
                                         'cLDL', 'cHDL', 'Tg', 'DBP', 'SBP', 'TimeT2DM', 'Ferritin'))),
       aes(value)) +
  geom_histogram(bins = 50) +
  facet_grid(sex_female ~ name, scales = "free") +
  theme_bw()
## Warning: Removed 59938 rows containing non-finite values (stat_bin).

Eliminem els valors atípics

Eliminem els valors superiors a mn +/- 5*SD

remove_outliers <- function(x){
  m <- mean(x, na.rm = TRUE)
  s <- sd(x, na.rm = TRUE)
  upperbound <- m + (5*s)
  lowerbound <- m - (5*s)
  ifelse((x > lowerbound) & (x < upperbound), x, NaN)
}
bbdd_covar <- bbdd_covar %>%
  group_by(sex_female) %>% 
  mutate(across(c('HbA1c', 'Leukocytes', 'Monocytes', 'cLDL', 'cHDL', 'Tg', 'DBP', 'SBP', 'Glucose',
                  'TimeT2DM', 'Ferritin'),
                remove_outliers)) %>%
  ungroup
  
ind_cc <- complete.cases(bbdd_covar %>%
                           select(rowId, age, HbA1c, BMI, TimeT2DM, Ferritin,
                                  Leukocytes, Monocytes, cLDL, cHDL, Tg, DBP, SBP, Glucose))
#, HbA1c
bbdd <- bbdd_covar[ind_cc,] %>%
  select(rowId, age, BMI, HbA1c, Leukocytes, Monocytes, cLDL, cHDL, Tg, DBP, SBP, Glucose,
         TimeT2DM, sex_female, obesity_BMI, Current, Former, Ferritin, 
         ami, angor, stroke, tia,
         A10, A10A, A10B, C01, C02, C03, C07, C08, C09, C10, M01A)
desc_bbdd(abbdd = bbdd)
ggplot(bbdd %>%
         select(rowId, age, BMI, HbA1c, Leukocytes, Monocytes, cLDL, Tg, cHDL, DBP, SBP, Glucose,
                TimeT2DM, Ferritin,
                sex_female, obesity_BMI) %>% 
         pivot_longer(-c(rowId, sex_female, obesity_BMI)) %>%
         mutate(name = factor(name,
                              levels = c('age', 'BMI', 'HbA1c', 'Glucose', 'Leukocytes', 'Monocytes',
                                         'cLDL', 'cHDL', 'Tg', 'DBP', 'SBP', 'TimeT2DM', 'Ferritin'))),
       aes(value)) +
  geom_histogram(bins = 50) +
  facet_grid(sex_female ~ name, scales = "free") +
  theme_bw()

Correlacions

bbdd %>%
  select(rowId, age, BMI, HbA1c, Leukocytes, Monocytes, cLDL, Tg, cHDL, DBP, SBP, Glucose, TimeT2DM,
         Ferritin,sex_female) %>%
  split(f = .$sex_female) %>% 
  map_dfr(~{.x %>% 
      select(-rowId, -sex_female) %>% 
      cor %>%
      reshape2::melt()},
      .id = "sex") %>% 
  mutate(rsq = value^2, 
           to_mark = ifelse(Var1 != Var2 & rsq > .25, "*", NA),
           across(c(Var1, Var2), toupper)) %>%
    ggplot(aes(Var1, Var2, fill = value)) +
    geom_tile() +
    geom_text(aes(label = to_mark), na.rm = TRUE) +
    scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1)) +
    theme_minimal() +
    facet_wrap(~sex, nrow = 1) +
    labs(x = NULL, y = NULL, fill = "Correlation", caption = "* Squared correlation higher than 0.25")

UMAP with residualized

bbdd_resid <- bbdd %>%
  mutate(HTA_med = as.numeric(C02 | C03 | C07 | C08 | C09)) %>%
  split(f = .$sex_female) %>%
  map(~{
    .x %>%
    transmute(rowId,
              across(c("HbA1c", "Glucose", "Leukocytes", "Monocytes", "cLDL", "cHDL", "Tg",
                       "DBP", "SBP", 'TimeT2DM', 'Ferritin'),
                     function(v,...){
                       dd <- data.frame(vcol = v, a = age, b = BMI, c = Current,
                                        d = Former, a10a = A10A, a10b = A10B, f = C10, g = HTA_med)
                       knots_age <- quantile(x = dd$age,
                                             probs = c(0.2, 0.4, 0.6, 0.8))
                       knots_BMI <- quantile(x = dd$BMI,
                                             probs = c(0.2, 0.4, 0.6, 0.8))
                       lm(vcol ~ bs(a, knots = knots_age) + bs(b, knots = knots_BMI) +
                            c + a10a + a10b + f + g, data = dd) %>%
                         resid %>% scale %>% as.vector
                     }))
  })


ggplot(bbdd_resid %>%
         bind_rows(.id = "sex_female") %>%
         pivot_longer(-c(rowId, sex_female)) %>%
         mutate(name = factor(name,
                              levels = c('age', 'BMI', 'HbA1c', 'Glucose', 'Leukocytes', 'Monocytes',
                                         'cLDL', 'cHDL', 'Tg', 'DBP', 'SBP', 'TimeT2DM', 'Ferritin'))),
       aes(value)) +
  geom_histogram(bins = 50) +
  facet_grid(sex_female ~ name, scales = "free") +
  theme_bw()

bbdd_resid_saved <- bbdd_resid

Correlations

bbdd_resid %>%
  map_dfr(~{.x %>% 
      select(-rowId) %>% 
      cor %>%
      reshape2::melt()},
      .id = "sex") %>% 
  mutate(rsq = value^2, 
           to_mark = ifelse(Var1 != Var2 & rsq > .25, "*", NA),
           across(c(Var1, Var2), toupper)) %>%
    ggplot(aes(Var1, Var2, fill = value)) +
    geom_tile() +
    geom_text(aes(label = to_mark), na.rm = TRUE) +
    scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1)) +
    theme_minimal() +
    facet_wrap(~sex, nrow = 1) +
    labs(x = NULL, y = NULL, fill = "Correlation", caption = "* Squared correlation higher than 0.25")

UMAP

Calculem l’UMAP sense estandarditzar. Utilitzem la mètrica Manhattan.

umap_resid <- bbdd_resid %>%
  map(~{
    n_total <- nrow(.x)
    nn <- round(10 + 15 * (log10(n_total) - 3))
    # print(names(.x))
    umap(X = .x %>% select(-rowId),
         n_neighbors = nn,
         min_dist = 0,
         n_components = 2, 
         nn_method = "annoy",
         approx_pow = TRUE,
         n_sgd_threads = "auto", 
         binary_edge_weights = TRUE,
         dens_scale = .5, 
         ret_extra = c("model", "nn", "fgraph"))
  })
umap_embed <- bbdd_resid %>%
  map2(umap_resid, ~{
    dat1 <- tibble(rowId = .x$rowId)
    dat2 <- data.frame(.y$embedding)
    bind_cols(dat1, dat2)
  })
umap_embed %>%
    bind_rows(.id = "sex") %>%
    ggplot(aes(X1, X2)) +
    geom_point(alpha = .1) +
    facet_wrap(~sex, nrow = 1) +
    theme_bw() +
    labs(x = "UMAP1", y = "UMAP2")

umap_embed %>%
    bind_rows(.id = "sex") %>%
    ggplot(aes(X1, X2)) +
    geom_density_2d_filled() +
    facet_wrap(~sex, nrow = 1) +
    theme_bw() +
    theme(legend.position = "none") +
    labs(x = "UMAP1", y = "UMAP2")

umap_embed_saved <- umap_embed

Overlaying input variables

umap_embed %>%
  bind_rows(.id = "sex_female") %>%
  {
    d <- bbdd %>%
      select(rowId, sex_female, age, BMI)
    inner_join(., d, by = c("rowId", "sex_female"))
  } %>%
  pivot_longer(-c(X1, X2, sex_female, rowId)) %>%
  split(f = .$name) %>%
  imap(~{
    mp <- median(.x$value)
    .x %>%
      ggplot(aes(X1, X2)) +
      geom_point(aes(color = value)) +
      scale_color_gradient2(midpoint = mp) +
      facet_wrap(~sex_female, nrow = 1) +
      theme_bw() +
      labs(x = "UMAP1", y = "UMAP2", 
           title = ifelse(.y == "bmi", "BMI", stringr::str_to_title(.y)), 
           color = NULL)
  }) %>%
  wrap_plots(ncol = 1)

umap_embed %>%
  bind_rows(.id = "sex_female") %>%
  {
    d <- bbdd %>%
      mutate(HTA_med = as.numeric(C02 | C03 | C07 | C08 | C09),
             obesity_BMI = as.numeric(obesity_BMI == 'Obesity')) %>%
      select(rowId, sex_female, obesity_BMI, Current, Former, ami, angor, stroke, tia,
             A10A, A10B, HTA_med, C10, M01A)
    inner_join(., d, by = c("rowId", "sex_female"))
  } %>%
  pivot_longer(-c(X1, X2, sex_female, rowId)) %>%
  mutate(name = toupper(name),
         value = as.factor(value)) %>% 
  ggplot(aes(X1, X2)) +
  geom_point(aes(color = value)) +
  facet_grid(sex_female + value ~ name) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        legend.position = "top") +
  labs(x = NULL,
       y = NULL,
       color = "Value")

umap_embed %>%
  map2_dfr(bbdd_resid,
           inner_join,
           by = "rowId",
           .id = "sex_female") %>%
  tidyr::pivot_longer(-c(X1, X2, sex_female, rowId)) %>%
  mutate(name = toupper(name)) %>% #, 
  ggplot(aes(X1, X2)) +
  geom_point(aes(color = value)) +
  scale_color_gradient2(limits = c(-10, 10), low = scales::muted("blue"), high = scales::muted("red")) +
  facet_grid(sex_female ~ name) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        legend.position = "top") +
  labs(x = NULL,
       y = NULL,
       color = "Value")

Correlation between UMAP axes and input variables

umap_embed %>%
  map2_dfr(bbdd_resid,
           inner_join,
           by = "rowId",
           .id = "sex_female") %>%
  {
    d <- bbdd %>%                                                                                                     
      select(rowId, sex_female, age, BMI)
    inner_join(., d, by = c("rowId", "sex_female"))
  } %>%
  tidyr::pivot_longer(-c(sex_female, rowId, X1, X2)) %>%
  group_by(sex_female, name) %>%
  group_modify(~{
    .x %>%
      with(data.frame(umap_axis = c("UMAP1", "UMAP2"),
                      Correlation = c(cor(X1, value), cor(X2, value))))
  }) %>%
  mutate(name = toupper(name)) %>%
  group_by(sex_female, umap_axis) %>%
  group_map(~{.x %>%
      ggplot(aes(Correlation, reorder(name, abs(Correlation)))) +
      geom_vline(xintercept = 0, lty = "dashed") +
      geom_segment(aes(xend = 0, yend = reorder(name, abs(Correlation)))) +
      geom_point(color = "red") +
      theme_bw() +
      labs(x = NULL, y = NULL, title = paste(.y$sex_female, "-", .y$umap_axis))
  }) %>%
  wrap_plots

Archetypes

umap_fgraphs <- umap_resid %>%
  map2(bbdd_resid,
       ~`dimnames<-`(.x$fgraph, list(.y$rowId, .y$rowId))) %>%
  map(~igraph::graph_from_adjacency_matrix(.x, mode = "undirected"))
eigen_res <- umap_fgraphs %>%
  map(~igraph::cluster_leading_eigen(.x))
eigen_clus <- eigen_res %>%
  map(igraph::membership) %>%
  map(~data.frame(rowId = as.numeric(names(.x)), cluster = as.numeric(.x)))
evcent_dat <- eigen_clus %>%
  imap(~.x %>%
         split(f = .$cluster) %>%
         map_dfr(function(CLUS){
           igraph::subgraph(umap_fgraphs[[.y]], as.character(CLUS$rowId)) %>%
             igraph::evcent() %>%
             pluck("vector") %>%
             data.frame %>%
             tibble::rownames_to_column() %>%
             setNames(c("rowId", "evcent_score")) %>%
             mutate(rowId = as.numeric(rowId))
          },
          .id = "cluster") %>%
         mutate(cluster = as.numeric(cluster)))
repr_ind <- evcent_dat %>%
  map(~.x %>%
        group_by(cluster) %>%
        slice_max(evcent_score) %>%
        ungroup)
arch_mod <- bbdd_resid %>%
  imap(~{mat <- select(.x, -rowId)
  mat <- as.matrix(mat)
  initpoints <- which(.x$rowId %in% repr_ind[[.y]]$rowId)
  res <- archetypes::archetypes(
                mat, 
                k = length(initpoints), 
                family = archetypes::archetypesFamily(initfn = archetypes:::make.fix.initfn(initpoints)),
                verbose = FALSE,
                saveHistory = FALSE)
  return(res)
})
archdat <- arch_mod %>%
  map(~data.frame(.x$archetypes), .id = "sex_female")
archdat %>%
  map_dfr(~{.x %>%
      mutate(arch_number = factor(row_number())) %>%
      pivot_longer(-arch_number)
    },
    .id = "sex_female") %>%
  mutate(name = toupper(name)) %>%
  ggplot(aes(value, arch_number)) +
  geom_vline(xintercept = 0) +
  geom_segment(aes(xend = 0, yend = arch_number)) +
  geom_point() +
  facet_grid(sex_female ~ name, scales = "free_y") +
  labs(x = NULL, y = NULL) +
  theme_bw()

dic_women <- matrix(ncol=2, nrow=nrow(archdat$Women))
colnames(dic_women) <- c("archnum", "archnam")
for (x in 1: nrow(dic_women)){
  dic_women[x,1] = x
  dic_women[x,2] = paste("Cluster", x, sep = " ")
}

dic_men <- matrix(ncol=2, nrow=nrow(archdat$Men))
colnames(dic_men) <- c("archnum", "archnam")
for (x in 1: nrow(dic_men)){
  dic_men[x,1] = x
  dic_men[x,2] = paste("Cluster", x, sep = " ")
}

arch_dict <- list(
    Women = as_tibble(dic_women),
    Men = as_tibble(dic_men))

arch_labs <- archdat %>%
  imap(~{bind_cols(arch_dict[[.y]], .x)})

archdat_umap <- map(
  setNames(c("Men", "Women"), c("Men", "Women")), ~{
    d <- select(arch_labs[[.x]], -c(archnum, archnam))
    res <- umap_transform(d, umap_resid[[.x]])
    res <- data.frame(res)
    bind_cols(select(arch_labs[[.x]], archnam), res)
  })
map(c("Men", "Women"), ~{
  umap_embed[[.x]] %>%
    ggplot(aes(X1, X2)) +
    geom_point(alpha = .1) +
    geom_point(data = archdat_umap[[.x]], aes(fill = archnam), shape = 23, size = 3) +
    theme_bw() +
    labs(x = "UMAP1", y = "UMAP2", title = .x, fill = "Archetypes")
  }) %>%
  wrap_plots(nrow = 1)

arch_alphas <- arch_mod %>%
  map2(arch_labs, ~{
    data.frame(.x$alphas) %>%
      setNames(.y$archnam)
    }) %>%
  map2(bbdd_resid, ~{
    .y %>%
      select(rowId) %>%
      bind_cols(.x)
  })
arch_alphas_long <- arch_alphas %>%
    map(~pivot_longer(.x, -rowId))
arch_alphas_long %>%
  bind_rows(.id = "sex_female") %>%
  ggplot(aes(name, value)) +
  geom_boxplot(aes(group = name, fill = name)) +
  theme_bw() +
  theme(legend.position = "none") +
  facet_wrap(~sex_female, ncol = 1) +
  labs(x = NULL, y = "Probability")

arch_maxprob <- arch_alphas_long %>%
  map(~{.x %>%
      group_by(rowId) %>%
      slice_max(value, with_ties = FALSE) %>%
      ungroup
    })
arch_maxprob %>%
  imap(~{.x %>%
      inner_join(umap_embed[[.y]], by = "rowId")
    }) %>%
  bind_rows(.id = "sex_female") %>%
  ggplot(aes(X1, X2)) +
  geom_point(aes(color = name, alpha = value)) +
  guides(color = guide_legend(override.aes = list(shape = 19))) +
  scale_alpha_identity() +
  facet_wrap(~sex_female, nrow = 1) +
  theme_bw() +
  labs(x = "UMAP1", y = "UMAP2", color = "Archetypes")

arch_maxprob %>%
  imap(~{d <- umap_embed[[.y]]
  .x %>%
    split(f = .$name) %>%
    map(function(ARCH){
      dd <- inner_join(ARCH, d, by = "rowId")
      d %>%
        ggplot(aes(X1, X2)) +
        geom_point(alpha = .1) +
        geom_point(data = dd, color = "red", alpha = .5, size = .2) +
        theme_bw() +
        labs(x = NULL, y = NULL, title = ARCH$name[1])
      }) %>%
    modify_at(1, function(g){ g + labs(y = .y) }) %>%
    wrap_plots(nrow = 1)
  }) %>%
  wrap_plots(nrow = 2)

arch_maxprob %>%
  imap(~{.x %>%
      filter(value > .5) %>%
      inner_join(umap_embed[[.y]], by = "rowId")
    }) %>%
  bind_rows(.id = "sex_female") %>%
  ggplot(aes(X1, X2)) +
  geom_point(aes(color = name, alpha = value)) +
  guides(color = guide_legend(override.aes = list(shape = 19))) +
  scale_alpha_identity() +
  facet_wrap(~sex_female, nrow = 1) +
  theme_bw() +
  labs(x = "UMAP1", y = "UMAP2", color = "Archetypes")

diseases <- bbdd_covar %>% select(rowId, ami, angor, stroke, tia)

umap_disease <- umap_resid %>%
    map(pluck, "embedding") %>%
    map(data.frame) %>%
    map2_dfr(bbdd_resid, bind_cols, .id = "sex") %>%
    select(sex, rowId, X1, X2) %>%
    left_join(diseases, by = "rowId")
head(umap_disease)
##   sex rowId         X1         X2 ami angor stroke tia
## 1 Men  3785  0.1613661  0.4415029   0     0      0   0
## 2 Men  4678  0.5333863 -0.6962384   0     0      0   0
## 3 Men  6049  0.9120204  0.2245933   0     0      0   0
## 4 Men 19645 -0.9482439  1.9006338   0     0      1   0
## 5 Men 25515 -0.7372368 -0.8155515   1     0      0   0
## 6 Men 25754  1.8516054  0.5274401   0     0      0   0
options(repr.plot.width = 2, repr.plot.height = 10)
umap_disease %>%
    split(f = .$sex) %>%
    imap(
        ~{
            .x %>%
                tidyr::pivot_longer(-c(rowId, sex, X1, X2), names_to = "Disease", values_to = "Diagnosis") %>%
                mutate(Disease = gsub("_", " ", Disease),
                       Diagnosis = ifelse(Diagnosis == 1, "Yes", "No")) %>%
                ggplot(aes(X1, X2)) +
                geom_point(aes(color = Diagnosis, 
                               shape = ifelse(Diagnosis == "Yes", "*", "."),
                               alpha = ifelse(Diagnosis == "Yes", .5, .1)),
                           na.rm = TRUE) +
                scale_shape_identity() +
                scale_alpha_identity() +
                scale_color_manual(values = c("grey", "red")) +
                guides(color = guide_legend(override.aes = list(alpha = 1))) +
                geom_point(data = archdat_umap[[.y]], aes(fill = archnam), shape = 23, size = 3) +
                facet_wrap(~Disease, nrow = 2) +
                theme_bw() +
                labs(title = .x$sex[1], x = "UMAP1", y = "UMAP2", fill = paste("Archetypes -", .y))
        }
    ) %>%
    patchwork::wrap_plots(ncol = 1, guides = "collect")

arch_disease <- umap_disease %>%
    select(-c(X1, X2)) %>%
    pivot_longer(-c(sex, rowId), names_to = "Disease", values_to = "Diagnosis") %>%
    split(f = .$sex) %>%
    imap(~inner_join(.x, arch_alphas_long[[.y]], by = "rowId"))
map(arch_disease, head)
## $Men
## # A tibble: 6 x 6
##   sex   rowId Disease Diagnosis name       value
##   <chr> <dbl> <chr>       <dbl> <chr>      <dbl>
## 1 Men    3785 ami             0 Cluster 1 0     
## 2 Men    3785 ami             0 Cluster 2 0.100 
## 3 Men    3785 ami             0 Cluster 3 0.360 
## 4 Men    3785 ami             0 Cluster 4 0.322 
## 5 Men    3785 ami             0 Cluster 5 0.124 
## 6 Men    3785 ami             0 Cluster 6 0.0935
## 
## $Women
## # A tibble: 6 x 6
##   sex   rowId Disease Diagnosis name       value
##   <chr> <dbl> <chr>       <dbl> <chr>      <dbl>
## 1 Women 13024 ami             0 Cluster 1 0.0373
## 2 Women 13024 ami             0 Cluster 2 0     
## 3 Women 13024 ami             0 Cluster 3 0     
## 4 Women 13024 ami             0 Cluster 4 0.105 
## 5 Women 13024 ami             0 Cluster 5 0.0833
## 6 Women 13024 ami             0 Cluster 6 0.322
arch_logreg <- arch_disease %>%
    map_dfr(
        ~{
            .x %>%
                mutate(
                    prob = ifelse(value == 0, .Machine$double.eps, value)
                ) %>%
                split(f = .$Disease) %>%
                map_dfr(
                    function(DX){
                        DX %>%
                            split(f = .$name) %>%
                            map_dfr(
                                function(ARCH){
                                    suppressWarnings({
                                        glm(Diagnosis ~ value, data = ARCH, family = binomial) %>%
                                            broom::tidy()
                                    })
                                },
                                .id = "Archetype"
                            )
                    },
                    .id = "Disease"
                )
        },
        .id = "sex"
    )


head(arch_logreg)
## # A tibble: 6 x 8
##   sex   Disease Archetype term        estimate std.error statistic  p.value
##   <chr> <chr>   <chr>     <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 Men   ami     Cluster 1 (Intercept)   -2.23      0.120    -18.6  2.71e-77
## 2 Men   ami     Cluster 1 value         -2.00      1.01      -1.99 4.64e- 2
## 3 Men   ami     Cluster 2 (Intercept)   -2.50      0.146    -17.1  2.20e-65
## 4 Men   ami     Cluster 2 value          0.870     0.590      1.47 1.40e- 1
## 5 Men   ami     Cluster 3 (Intercept)   -2.04      0.139    -14.7  3.41e-49
## 6 Men   ami     Cluster 3 value         -2.36      0.791     -2.98 2.84e- 3
options(repr.plot.width = 8, repr.plot.height = 20)
arch_logreg %>%
    filter(term == "value") %>%
    filter(p.value < .1) %>%
    mutate(ci = qnorm(1 - .05/2) * std.error,
           conf.low = estimate - ci, conf.high = estimate + ci) %>%
    ggplot(aes(estimate, interaction(Disease, Archetype))) +
    geom_vline(xintercept = 0, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = Archetype)) +
    scale_y_discrete(guide = ggh4x::guide_axis_nested()) +
    facet_wrap(~sex, nrow = 1, scales = "free") +
    theme_bw() +
    theme(legend.position = "none") +
    labs(title = "Significant cross-sectional associations with diseases\n", 
         x = "Log OR per 1% increase\nin archetypal probability", y = NULL)

### STROKE

diseases <- bbdd_covar %>% select(rowId, age, BMI,BMI, HbA1c, Glucose, Leukocytes, Monocytes,
                                         cLDL, cHDL, Tg, DBP, SBP, TimeT2DM, Ferritin,  t.ep_StrokeI, i.ep_StrokeI,
                                  t.ep_AMI, i.ep_AMI, t.ep_Angor, i.ep_Angor, t.ep_TIA, i.ep_TIA, sex_female, ami, stroke, angor, tia)

diseases <- diseases %>% group_by(sex_female) %>% group_split() %>% map2(arch_maxprob, function(dat, arch){
dat <- arch %>% select(rowId, name) %>% left_join(dat, by="rowId")
})

diseases <- diseases %>% map(function(dat){dat <- dat %>% filter(ami == 0, stroke == 0, angor==0, tia == 0)})
diseases  %>% map(function(data){
  autoplot(survfit(Surv(t.ep_StrokeI, i.ep_StrokeI) ~ name, data=data), main="Stroke survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

clusters_men <- diseases[[1]] 
clusters_men$name <- as.factor(clusters_men$name)
clusters_men$name <- relevel(clusters_men$name, ref= "Cluster 1") #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- diseases[[2]]
clusters_women <- clusters_women %>% filter(name != "Cluster 3")
clusters_women$name <- as.factor(clusters_women$name)
clusters_women$name <- relevel(clusters_women$name, ref = "Cluster 4")

clusters = list(clusters_men, clusters_women)

diseases_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_StrokeI, i.ep_StrokeI)~age + BMI+ name, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(diseases_surv)
## [[1]]
## # A tibble: 8 x 7
##   term          estimate std.error statistic    p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>      <dbl>    <dbl>     <dbl>
## 1 age               1.08    0.0164     4.53  0.00000597    1.04       1.11
## 2 BMI               1.00    0.0379     0.126 0.900         0.933      1.08
## 3 nameCluster 2     1.56    1.12       0.400 0.689         0.175     14.0 
## 4 nameCluster 3     2.44    1.06       0.839 0.401         0.304     19.6 
## 5 nameCluster 4     2.53    1.07       0.866 0.387         0.310     20.6 
## 6 nameCluster 5     3.42    1.12       1.10  0.272         0.381     30.7 
## 7 nameCluster 6     2.45    1.04       0.859 0.390         0.318     18.8 
## 8 nameCluster 7     4.09    1.08       1.30  0.192         0.492     34.0 
## 
## [[2]]
## # A tibble: 8 x 7
##   term          estimate std.error statistic  p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 age              1.05     0.0139     3.37  0.000740    1.02      1.08 
## 2 BMI              0.944    0.0263    -2.20  0.0275      0.896     0.994
## 3 nameCluster 1    5.43     0.691      2.45  0.0143      1.40     21.1  
## 4 nameCluster 2    4.77     0.645      2.42  0.0154      1.35     16.9  
## 5 nameCluster 5    3.87     0.660      2.05  0.0401      1.06     14.1  
## 6 nameCluster 6    2.10     0.660      1.12  0.261       0.576     7.64 
## 7 nameCluster 7    1.54     0.731      0.594 0.552       0.368     6.47 
## 8 nameCluster 8    5.56     0.644      2.66  0.00774     1.57     19.7
options(repr.plot.width = 9, repr.plot.height = 3)
diseases_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 6 rows containing missing values (geom_segment).

options(repr.plot.width = 9, repr.plot.height = 3)
diseases_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 6 rows containing missing values (geom_segment).

AMI

diseases  %>% map(function(data){
  autoplot(survfit(Surv(t.ep_AMI, i.ep_AMI) ~ name, data=data), main="AMI survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

clusters_men <- diseases[[1]] 
clusters_men$name <- as.factor(clusters_men$name)
clusters_men$name <- relevel(clusters_men$name, ref="Cluster 5") #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- diseases[[2]]
clusters_women <- clusters_women %>% filter(name != "Cluster 8", name != "Cluster 3")
clusters_women$name <- as.factor(clusters_women$name)
clusters_women$name <- relevel(clusters_women$name, ref = "Cluster 7")

clusters = list(clusters_men, clusters_women)

diseases_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_AMI, i.ep_AMI)~age + BMI+ name, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(diseases_surv)
## [[1]]
## # A tibble: 8 x 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age              1.04     0.0183     2.03   0.0427    1.00       1.08
## 2 BMI              0.946    0.0471    -1.18   0.237     0.863      1.04
## 3 nameCluster 1    2.79     1.23       0.837  0.403     0.252     30.8 
## 4 nameCluster 2    2.17     1.12       0.692  0.489     0.242     19.4 
## 5 nameCluster 3    2.69     1.07       0.923  0.356     0.330     21.9 
## 6 nameCluster 4    3.69     1.06       1.23   0.218     0.461     29.5 
## 7 nameCluster 6    1.28     1.10       0.223  0.823     0.149     10.9 
## 8 nameCluster 7    2.51     1.15       0.796  0.426     0.261     24.1 
## 
## [[2]]
## # A tibble: 7 x 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age               1.05    0.0256    1.88    0.0602   0.998       1.10
## 2 BMI               1.02    0.0437    0.368   0.713    0.933       1.11
## 3 nameCluster 1     4.67    1.22      1.26    0.208    0.423      51.5 
## 4 nameCluster 2     5.39    1.10      1.53    0.125    0.626      46.4 
## 5 nameCluster 4     2.24    1.23      0.658   0.511    0.203      24.7 
## 6 nameCluster 5     1.14    1.41      0.0917  0.927    0.0711     18.2 
## 7 nameCluster 6     4.00    1.08      1.28    0.200    0.480      33.3
options(repr.plot.width = 9, repr.plot.height = 3)
diseases_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 6 rows containing missing values (geom_segment).

options(repr.plot.width = 9, repr.plot.height = 3)
diseases_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 5 rows containing missing values (geom_segment).

TIA

diseases  %>% map(function(data){
  autoplot(survfit(Surv(t.ep_TIA, i.ep_TIA) ~ name, data=data), main="TIA survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

clusters_men <- diseases[[1]] 
clusters_men <- clusters_men %>%filter(name != "Cluster 5", name != "Cluster 1")
clusters_men$name <- as.factor(clusters_men$name)
clusters_men$name <- relevel(clusters_men$name, ref="Cluster 2") #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- diseases[[2]]
clusters_women$name <- as.factor(clusters_women$name)
clusters_women$name <- relevel(clusters_women$name, ref = "Cluster 3")

clusters = list(clusters_men, clusters_women)

diseases_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_TIA, i.ep_TIA)~age + BMI+ name, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(diseases_surv)
## [[1]]
## # A tibble: 6 x 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age              1.05     0.0219     2.06   0.0395    1.00       1.09
## 2 BMI              0.974    0.0556    -0.466  0.641     0.874      1.09
## 3 nameCluster 3    2.69     0.803      1.23   0.217     0.558     13.0 
## 4 nameCluster 4    1.39     0.914      0.362  0.718     0.232      8.34
## 5 nameCluster 6    1.54     0.818      0.527  0.598     0.310      7.64
## 6 nameCluster 7    2.43     0.914      0.970  0.332     0.405     14.6 
## 
## [[2]]
## # A tibble: 9 x 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age              1.04     0.0146     2.62  0.00867    1.01       1.07
## 2 BMI              0.972    0.0279    -1.03  0.301      0.920      1.03
## 3 nameCluster 1    4.32     1.10       1.33  0.183      0.502     37.1 
## 4 nameCluster 2    3.80     1.05       1.27  0.206      0.481     30.0 
## 5 nameCluster 4    2.10     1.10       0.677 0.498      0.245     18.1 
## 6 nameCluster 5    3.18     1.07       1.08  0.280      0.390     26.0 
## 7 nameCluster 6    2.86     1.05       1.01  0.314      0.369     22.2 
## 8 nameCluster 7    2.23     1.08       0.741 0.459      0.268     18.6 
## 9 nameCluster 8    1.96     1.12       0.601 0.548      0.219     17.5
options(repr.plot.width = 9, repr.plot.height = 3)
diseases_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 4 rows containing missing values (geom_segment).

options(repr.plot.width = 9, repr.plot.height = 3)
diseases_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 7 rows containing missing values (geom_segment).

Angor

diseases  %>% map(function(data){
  autoplot(survfit(Surv(t.ep_Angor, i.ep_Angor) ~ name, data=data), main="Angor survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

clusters_men <- diseases[[1]] 
clusters_men <- clusters_men %>% filter (name != "Cluster 7")
clusters_men$name <- as.factor(clusters_men$name)
clusters_men$name <- relevel(clusters_men$name, ref="Cluster 2") #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- diseases[[2]]
clusters_women$name <- as.factor(clusters_women$name)
clusters_women$name <- relevel(clusters_women$name, ref = "Cluster 8")

clusters = list(clusters_men, clusters_women)

diseases_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_Angor, i.ep_Angor)~age + BMI+ name, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(diseases_surv)
## [[1]]
## # A tibble: 7 x 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age               1.03    0.0169    1.83    0.0667   0.998       1.07
## 2 BMI               1.08    0.0342    2.25    0.0244   1.01        1.15
## 3 nameCluster 1     1.09    1.23      0.0674  0.946    0.0983     12.0 
## 4 nameCluster 3     2.89    0.804     1.32    0.187    0.597      13.9 
## 5 nameCluster 4     2.74    0.817     1.23    0.218    0.552      13.6 
## 6 nameCluster 5     2.75    0.917     1.10    0.271    0.455      16.6 
## 7 nameCluster 6     3.13    0.761     1.50    0.134    0.704      13.9 
## 
## [[2]]
## # A tibble: 9 x 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age              1.01     0.0187     0.697   0.486    0.977      1.05
## 2 BMI              0.970    0.0371    -0.827   0.408    0.902      1.04
## 3 nameCluster 1    5.92     1.16       1.54    0.124    0.614     57.1 
## 4 nameCluster 2    3.39     1.12       1.09    0.275    0.379     30.4 
## 5 nameCluster 3    2.00     1.42       0.489   0.625    0.125     32.0 
## 6 nameCluster 4    4.86     1.10       1.44    0.149    0.567     41.7 
## 7 nameCluster 5    1.99     1.23       0.561   0.575    0.180     21.9 
## 8 nameCluster 6    2.71     1.10       0.909   0.363    0.316     23.2 
## 9 nameCluster 7    3.32     1.12       1.07    0.283    0.370     29.8
options(repr.plot.width = 9, repr.plot.height = 3)
diseases_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 5 rows containing missing values (geom_segment).

options(repr.plot.width = 9, repr.plot.height = 3)
diseases_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 7 rows containing missing values (geom_segment).

GMM MODEL

men <- rep(c("Male"), nrow(bbdd_resid[[1]]))
list_men <- cbind(bbdd_resid[[1]], men)
names <- colnames(bbdd_resid[[1]])
colnames(list_men) <- c(names, "sex_female")

women <- rep(c("Female"), nrow(bbdd_resid[[2]]))
list_women <- cbind(bbdd_resid[[2]], women)
colnames(list_women) <- c(names, "sex_female")

bbdd_resid <- rbind(list_men, list_women)
BIC_curve <- bbdd_resid %>%
  select(-rowId) %>% 
  group_by(sex_female, .add = T) %>%
  group_split() %>% 
  map_dfr(function(strat){
    res <- Optimal_Clusters_GMM(data = strat %>% select(-sex_female),
                                max_clusters = 1:20, criterion = "BIC",
                                em_iter = 10, plot_data = FALSE)
    data.frame(sex_female = strat$sex_female[1],
               nclus = 1:20,
               BIC = as.vector(res))
  })
ggplot(BIC_curve,
       aes(nclus, BIC)) +
  geom_line(group = 1) +
  geom_point() +
  scale_x_continuous(breaks = 1:20) +
  facet_wrap(~ sex_female, scales = "free") +
  labs(x = "N clusters") +
  theme_bw()

ggplot(data = BIC_curve %>%
         mutate(bicgrad = abs(BIC - lag(BIC))) %>%
         ungroup %>%
         drop_na,
       aes(nclus - .5, bicgrad)) +
  geom_col(alpha = .3, color = "darkgrey") +
  geom_line(group = 1) +
  geom_point() +
  scale_x_continuous(breaks = 1:20) +
  facet_wrap(~ sex_female, scales = "free") +
  labs(x = "N clusters", y = "BIC change") +
  theme_bw()

fxclus <- function(x, k){
    mod <- GMM(data = x, gaussian_comps = k, em_iter = 10)
    pred <- predict_GMM(x, CENTROIDS = mod$centroids, 
                        COVARIANCE = mod$covariance_matrices, 
                        WEIGHTS = mod$weights)
    return(list(cluster = as.integer(pred$cluster_labels)))
}
res_sil <- bbdd_resid %>%
  select(-rowId) %>% 
  group_by(sex_female, .add = T) %>%
  group_split() %>%
  map_dfr(function(strat){
    map_dfr(2:10, function(k){
      map_dfr(1:100, function(iter){
        set.seed(iter)
        scaledat <- strat %>%
          select(-sex_female) %>%
          slice_sample(n = 1000)
        d <- dist(as.matrix(scaledat))
        res <- fxclus(scaledat, k)
        ave_sil <- cluster::silhouette(res$cluster, d)
        data.frame(nclus = k, mean_sil_width = mean(ave_sil[,"sil_width"]))
      })
    })
  },
  .id = "sex_female")
res_sil %>%
  group_by(sex_female, nclus) %>%
  summarise(mean_sw = mean(mean_sil_width),
            se_sw = sd(mean_sil_width)/(sqrt(length(mean_sil_width))),
            .groups = "drop") %>%
  ggplot(aes(nclus, mean_sw)) +
  geom_line(group = 1) +
  geom_linerange(aes(ymin = mean_sw - se_sw, ymax = mean_sw + se_sw)) +
  geom_point() +
  scale_x_continuous(breaks = 1:10) +
  facet_wrap(~ sex_female, nrow = 1, scales = "free") +
  theme_bw() +
  labs(x = "Number of clusters", y = "Mean silhouette width")

bbdd_resid %>%
  select(-rowId) %>%
  group_by(sex_female, .add = T) %>%
  group_split() %>%
  map_dfr(function(strat){
    map_dfr(2:20, function(nclus){
      scaledat <- strat %>%
          select(-sex_female)
      data.frame(cluster = fxclus(scaledat, nclus)$cluster) %>%
        count(cluster) %>%
        mutate(n = 100 * (n/sum(n))) %>%
        slice_min(n, with_ties = FALSE) %>%
        mutate(nclus)
    })
  },
  .id = 'sex_female') %>%
  ggplot(aes(nclus, n)) +
  geom_line(group = 1) +
  geom_point() +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
  facet_wrap(~sex_female, nrow = 1) +
  theme_bw() +
  labs(x = "Number of clusters", y = "Size of smallest cluster (%)")

umap_embed
## $Men
## # A tibble: 1,136 x 3
##    rowId      X1      X2
##    <dbl>   <dbl>   <dbl>
##  1  3785  0.161   0.442 
##  2  4678  0.533  -0.696 
##  3  6049  0.912   0.225 
##  4 19645 -0.948   1.90  
##  5 25515 -0.737  -0.816 
##  6 25754  1.85    0.527 
##  7 27214 -0.646   0.937 
##  8 27492  0.229   0.172 
##  9 30046 -0.656   1.56  
## 10 35119  0.0179  0.0525
## # ... with 1,126 more rows
## 
## $Women
## # A tibble: 1,500 x 3
##    rowId      X1      X2
##    <dbl>   <dbl>   <dbl>
##  1 13024 -0.795   0.638 
##  2 16592  0.596   1.66  
##  3 18652 -0.0850  1.09  
##  4 20405  0.0744  0.920 
##  5 22298  0.391   1.26  
##  6 37238  0.358   0.794 
##  7 46148 -0.763   1.09  
##  8 46159  0.617  -0.0384
##  9 49669 -1.01    0.960 
## 10 54634 -1.17    1.07  
## # ... with 1,490 more rows
bbdd_umap_50_resid <- umap_embed
names(bbdd_umap_50_resid) <- c("Men", "Women")


men <- matrix(data= c("Men"), nrow=nrow(bbdd_umap_50_resid$Men))
colnames(men) =c("sex")

bbdd_umap_50_resid_men <- cbind(bbdd_umap_50_resid$Men, men)

women <- matrix(data= c("Women"), nrow=nrow(bbdd_umap_50_resid$Women))
colnames(women) =c("sex")

bbdd_umap_50_resid_women <- cbind(bbdd_umap_50_resid$Women, women)

bbdd_umap_50_resid <- rbind(bbdd_umap_50_resid_women, bbdd_umap_50_resid_men)

bbdd_umap_50_resid_saved <- bbdd_umap_50_resid

6 clusters

bbdd_umap_50_resid <- bbdd_umap_50_resid_saved
set.seed(12345)
gmm_res <- bbdd_resid %>%
  select(-rowId) %>%
  group_by(sex_female, .add = T) %>%
  group_split() %>%
  map(function(strat){
    scaledat <- strat %>%
      select(-sex_female)
    mod <- GMM(data = scaledat,
               gaussian_comps = 6,
               em_iter = 10)
    pred <- predict_GMM(scaledat,
                        CENTROIDS = mod$centroids,
                        COVARIANCE = mod$covariance_matrices, 
                        WEIGHTS = mod$weights)
    return(list(mod = mod,
                pred = pred,
                grp = strat$sex_female[1]))
  })

gmm_res <- list(gmm_res[[2]], gmm_res[[1]])

bbdd_umap_50_resid <- bbdd_umap_50_resid %>%
  group_by(sex, .add = T) %>%
  group_split() %>%
  map2_dfr(gmm_res,
           ~.x %>%
             bind_cols(cluster_gmm = as.factor(.y$pred$cluster_labels)))


ggplot(bbdd_umap_50_resid) +
  geom_point(aes(x = X1, y = X2, colour = cluster_gmm)) +
  facet_grid(. ~ sex) +
  theme_bw()

gmm_res %>%
    map(
        function(strata){
            data.frame(MaxProb = apply(strata$pred$cluster_proba, 1, max)) %>%
                ggplot(aes(MaxProb)) +
                geom_histogram(bins = 10, fill = "lightblue", color = "black")
        }
    ) %>%
    wrap_plots &
    theme_bw() &
    labs(y = "Count")

gmm_res %>%
    map_dfr(~data.frame(prob80 = apply(.x$pred$cluster_proba, 1, max) > .8), .id = "sex") %>%
    ggplot(aes(sex, fill = prob80)) +
    geom_bar(stat="count", position ="fill") +
    theme_bw() +
    scale_y_continuous(labels = scales::percent) +
    labs(x = NULL, y = "%", fill = "MaxP > 80%")

gmm_res %>%
    map_dfr(~data.frame(cluster = .x$pred$cluster_labels) %>%
                count(cluster, name = "size") %>%
                mutate(prop = size / sum(size),
                       sex = .x$grp)) %>%
    ggplot(aes(cluster, prop * 100)) +
    geom_col(fill = "lightblue", color = "black") +
    facet_wrap(~sex) +
    theme_bw() +
    labs(x = "Cluster", y = "%")

gmm_res %>%
    map2(bbdd %>%
           select(HbA1c, Leukocytes, Monocytes, cLDL, Tg, cHDL, DBP, SBP, Glucose, TimeT2DM, Ferritin,
                  sex_female) %>% 
           group_by(sex_female, .add = T) %>%
           group_split(),
         function(strata, data){
           varnames <- names(select(data, -c(sex_female)))
           centroid_dat <- strata$mod$centroids %>%
             data.frame %>%
             setNames(paste0(varnames, "_mean"))
           sd_dat <- strata$mod$covariance_matrices %>%
             sqrt %>%
             data.frame %>%
             setNames(paste0(varnames, "_sd"))
           cbind(centroid_dat, sd_dat) %>%
             tibble::rownames_to_column("component") %>%
             pivot_longer(-component, names_sep = "_", names_to = c("name", ".value")) %>%
             mutate(sd = qnorm(1 - (0.05/2)) * sd) %>%
             ggplot(aes(mean, component)) +
             geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
             geom_vline(xintercept = c(-1, 1) * qnorm(1 - .2/2), lty = "dashed", color = "darkred") +
             geom_linerange(aes(xmin = mean - sd, xmax = mean + sd)) +
             geom_point(size = 2) +
             facet_wrap(~name, ncol = 3, scales = "free_x") +
             labs(title = strata$grp, x = NULL, y = NULL) +
             theme_bw()
         }) %>%
  wrap_plots(nrow = 1)

bbdd %>%
  select(age, BMI, HbA1c, Leukocytes, Monocytes, cLDL, Tg, cHDL, DBP, SBP, Glucose, TimeT2DM, Ferritin,
         sex_female) %>%
  group_by(sex_female, .add = T) %>%
  group_split() %>%
  map2(bbdd_umap_50_resid %>%
         group_by(sex_female = sex, .add = T) %>%
         group_split(),
       ~.x %>%
         bind_cols(.y %>% select(-sex_female, -X1, -X2)) %>%
         pivot_longer(-c(cluster_gmm, sex, sex_female)) %>%
         group_by(name) %>%
         mutate(mval = mean(value)) %>%
         ungroup %>%
         mutate(name = factor(name,
                              levels = c('age', 'BMI', 'HbA1c', 'Glucose', 'Leukocytes', 'Monocytes',
                                         'cLDL', 'cHDL', 'Tg', 'DBP', 'SBP', 'TimeT2DM', 'Ferritin'))) %>%
         ggplot(aes(cluster_gmm, value)) +
         geom_boxplot(aes(group = cluster_gmm, colour = cluster_gmm)) +
         geom_hline(aes(yintercept = mval), linetype = "dashed", color = "red") +
         facet_wrap(~name, scales = "free", nrow = 1) +
         theme_bw() +
         theme(legend.position = 'none') +
         labs(title = .x$sex[1], x = NULL, y = NULL)) %>%
  modify_at(2, ~.x + labs(x = "Cluster")) %>%
  wrap_plots(ncol = 1)
## Warning: Unknown or uninitialised column: `sex`.
## Unknown or uninitialised column: `sex`.

gmm_6 <- gmm_res

7 clusters

bbdd_umap_50_resid <- bbdd_umap_50_resid_saved
set.seed(12345)
gmm_res <- bbdd_resid %>%
  select(-rowId) %>%
  group_by(sex_female, .add = T) %>%
  group_split() %>%
  map(function(strat){
    scaledat <- strat %>%
      select(-sex_female)
    mod <- GMM(data = scaledat,
               gaussian_comps = 7,
               em_iter = 10)
    pred <- predict_GMM(scaledat,
                        CENTROIDS = mod$centroids,
                        COVARIANCE = mod$covariance_matrices, 
                        WEIGHTS = mod$weights)
    return(list(mod = mod,
                pred = pred,
                grp = strat$sex_female[1]))
  })

gmm_res <- list(gmm_res[[2]], gmm_res[[1]])

bbdd_umap_50_resid <- bbdd_umap_50_resid %>%
  group_by(sex, .add = T) %>%
  group_split() %>%
  map2_dfr(gmm_res,
           ~.x %>%
             bind_cols(cluster_gmm = as.factor(.y$pred$cluster_labels)))


ggplot(bbdd_umap_50_resid) +
  geom_point(aes(x = X1, y = X2, colour = cluster_gmm)) +
  facet_grid(. ~ sex) +
  theme_bw()

gmm_res %>%
    map(
        function(strata){
            data.frame(MaxProb = apply(strata$pred$cluster_proba, 1, max)) %>%
                ggplot(aes(MaxProb)) +
                geom_histogram(bins = 10, fill = "lightblue", color = "black")
        }
    ) %>%
    wrap_plots &
    theme_bw() &
    labs(y = "Count")

gmm_res %>%
    map_dfr(~data.frame(prob80 = apply(.x$pred$cluster_proba, 1, max) > .8), .id = "sex") %>%
    ggplot(aes(sex, fill = prob80)) +
    geom_bar(stat="count", position ="fill") +
    theme_bw() +
    scale_y_continuous(labels = scales::percent) +
    labs(x = NULL, y = "%", fill = "MaxP > 80%")

gmm_res %>%
    map_dfr(~data.frame(cluster = .x$pred$cluster_labels) %>%
                count(cluster, name = "size") %>%
                mutate(prop = size / sum(size),
                       sex = .x$grp)) %>%
    ggplot(aes(cluster, prop * 100)) +
    geom_col(fill = "lightblue", color = "black") +
    facet_wrap(~sex) +
    theme_bw() +
    labs(x = "Cluster", y = "%")

gmm_res %>%
    map2(bbdd %>%
           select(HbA1c, Leukocytes, Monocytes, cLDL, Tg, cHDL, DBP, SBP, Glucose, TimeT2DM, Ferritin,
                  sex_female) %>% 
           group_by(sex_female, .add = T) %>%
           group_split(),
         function(strata, data){
           varnames <- names(select(data, -c(sex_female)))
           centroid_dat <- strata$mod$centroids %>%
             data.frame %>%
             setNames(paste0(varnames, "_mean"))
           sd_dat <- strata$mod$covariance_matrices %>%
             sqrt %>%
             data.frame %>%
             setNames(paste0(varnames, "_sd"))
           cbind(centroid_dat, sd_dat) %>%
             tibble::rownames_to_column("component") %>%
             pivot_longer(-component, names_sep = "_", names_to = c("name", ".value")) %>%
             mutate(sd = qnorm(1 - (0.05/2)) * sd) %>%
             ggplot(aes(mean, component)) +
             geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
             geom_vline(xintercept = c(-1, 1) * qnorm(1 - .2/2), lty = "dashed", color = "darkred") +
             geom_linerange(aes(xmin = mean - sd, xmax = mean + sd)) +
             geom_point(size = 2) +
             facet_wrap(~name, ncol = 3, scales = "free_x") +
             labs(title = strata$grp, x = NULL, y = NULL) +
             theme_bw()
         }) %>%
  wrap_plots(nrow = 1)

bbdd %>%
  select(age, BMI, HbA1c, Leukocytes, Monocytes, cLDL, Tg, cHDL, DBP, SBP, Glucose, TimeT2DM, Ferritin,
         sex_female) %>%
  group_by(sex_female, .add = T) %>%
  group_split() %>%
  map2(bbdd_umap_50_resid %>%
         group_by(sex_female = sex, .add = T) %>%
         group_split(),
       ~.x %>%
         bind_cols(.y %>% select(-sex_female, -X1, -X2)) %>%
         pivot_longer(-c(cluster_gmm, sex, sex_female)) %>%
         group_by(name) %>%
         mutate(mval = mean(value)) %>%
         ungroup %>%
         mutate(name = factor(name,
                              levels = c('age', 'BMI', 'HbA1c', 'Glucose', 'Leukocytes', 'Monocytes',
                                         'cLDL', 'cHDL', 'Tg', 'DBP', 'SBP', 'TimeT2DM', 'Ferritin'))) %>%
         ggplot(aes(cluster_gmm, value)) +
         geom_boxplot(aes(group = cluster_gmm, colour = cluster_gmm)) +
         geom_hline(aes(yintercept = mval), linetype = "dashed", color = "red") +
         facet_wrap(~name, scales = "free", nrow = 1) +
         theme_bw() +
         theme(legend.position = 'none') +
         labs(title = .x$sex[1], x = NULL, y = NULL)) %>%
  modify_at(2, ~.x + labs(x = "Cluster")) %>%
  wrap_plots(ncol = 1)
## Warning: Unknown or uninitialised column: `sex`.
## Unknown or uninitialised column: `sex`.

gmm_7 <- gmm_res
gmm_res <- gmm_7

8 clusters

bbdd_umap_50_resid <- bbdd_umap_50_resid_saved
set.seed(12345)
gmm_res <- bbdd_resid %>%
  select(-rowId) %>%
  group_by(sex_female, .add = T) %>%
  group_split() %>%
  map(function(strat){
    scaledat <- strat %>%
      select(-sex_female)
    mod <- GMM(data = scaledat,
               gaussian_comps = 8,
               em_iter = 10)
    pred <- predict_GMM(scaledat,
                        CENTROIDS = mod$centroids,
                        COVARIANCE = mod$covariance_matrices, 
                        WEIGHTS = mod$weights)
    return(list(mod = mod,
                pred = pred,
                grp = strat$sex_female[1]))
  })

gmm_res <- list(gmm_res[[2]], gmm_res[[1]])

bbdd_umap_50_resid <- bbdd_umap_50_resid %>%
  group_by(sex, .add = T) %>%
  group_split() %>%
  map2_dfr(gmm_res,
           ~.x %>%
             bind_cols(cluster_gmm = as.factor(.y$pred$cluster_labels)))


ggplot(bbdd_umap_50_resid) +
  geom_point(aes(x = X1, y = X2, colour = cluster_gmm)) +
  facet_grid(. ~ sex) +
  theme_bw()

gmm_res %>%
    map(
        function(strata){
            data.frame(MaxProb = apply(strata$pred$cluster_proba, 1, max)) %>%
                ggplot(aes(MaxProb)) +
                geom_histogram(bins = 10, fill = "lightblue", color = "black")
        }
    ) %>%
    wrap_plots &
    theme_bw() &
    labs(y = "Count")

gmm_res %>%
    map_dfr(~data.frame(prob80 = apply(.x$pred$cluster_proba, 1, max) > .8), .id = "sex") %>%
    ggplot(aes(sex, fill = prob80)) +
    geom_bar(stat="count", position ="fill") +
    theme_bw() +
    scale_y_continuous(labels = scales::percent) +
    labs(x = NULL, y = "%", fill = "MaxP > 80%")

gmm_res %>%
    map_dfr(~data.frame(cluster = .x$pred$cluster_labels) %>%
                count(cluster, name = "size") %>%
                mutate(prop = size / sum(size),
                       sex = .x$grp)) %>%
    ggplot(aes(cluster, prop * 100)) +
    geom_col(fill = "lightblue", color = "black") +
    facet_wrap(~sex) +
    theme_bw() +
    labs(x = "Cluster", y = "%")

gmm_res %>%
    map2(bbdd %>%
           select(HbA1c, Leukocytes, Monocytes, cLDL, Tg, cHDL, DBP, SBP, Glucose, TimeT2DM, Ferritin,
                  sex_female) %>% 
           group_by(sex_female, .add = T) %>%
           group_split(),
         function(strata, data){
           varnames <- names(select(data, -c(sex_female)))
           centroid_dat <- strata$mod$centroids %>%
             data.frame %>%
             setNames(paste0(varnames, "_mean"))
           sd_dat <- strata$mod$covariance_matrices %>%
             sqrt %>%
             data.frame %>%
             setNames(paste0(varnames, "_sd"))
           cbind(centroid_dat, sd_dat) %>%
             tibble::rownames_to_column("component") %>%
             pivot_longer(-component, names_sep = "_", names_to = c("name", ".value")) %>%
             mutate(sd = qnorm(1 - (0.05/2)) * sd) %>%
             ggplot(aes(mean, component)) +
             geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
             geom_vline(xintercept = c(-1, 1) * qnorm(1 - .2/2), lty = "dashed", color = "darkred") +
             geom_linerange(aes(xmin = mean - sd, xmax = mean + sd)) +
             geom_point(size = 2) +
             facet_wrap(~name, ncol = 3, scales = "free_x") +
             labs(title = strata$grp, x = NULL, y = NULL) +
             theme_bw()
         }) %>%
  wrap_plots(nrow = 1)

bbdd %>%
  select(age, BMI, HbA1c, Leukocytes, Monocytes, cLDL, Tg, cHDL, DBP, SBP, Glucose, TimeT2DM, Ferritin,
         sex_female) %>%
  group_by(sex_female, .add = T) %>%
  group_split() %>%
  map2(bbdd_umap_50_resid %>%
         group_by(sex_female = sex, .add = T) %>%
         group_split(),
       ~.x %>%
         bind_cols(.y %>% select(-sex_female, -X1, -X2)) %>%
         pivot_longer(-c(cluster_gmm, sex, sex_female)) %>%
         group_by(name) %>%
         mutate(mval = mean(value)) %>%
         ungroup %>%
         mutate(name = factor(name,
                              levels = c('age', 'BMI', 'HbA1c', 'Glucose', 'Leukocytes', 'Monocytes',
                                         'cLDL', 'cHDL', 'Tg', 'DBP', 'SBP', 'TimeT2DM', 'Ferritin'))) %>%
         ggplot(aes(cluster_gmm, value)) +
         geom_boxplot(aes(group = cluster_gmm, colour = cluster_gmm)) +
         geom_hline(aes(yintercept = mval), linetype = "dashed", color = "red") +
         facet_wrap(~name, scales = "free", nrow = 1) +
         theme_bw() +
         theme(legend.position = 'none') +
         labs(title = .x$sex[1], x = NULL, y = NULL)) %>%
  modify_at(2, ~.x + labs(x = "Cluster")) %>%
  wrap_plots(ncol = 1)
## Warning: Unknown or uninitialised column: `sex`.
## Unknown or uninitialised column: `sex`.

gmm_8 <- gmm_res

9 clusters

bbdd_umap_50_resid <- bbdd_umap_50_resid_saved
set.seed(12345)
gmm_res <- bbdd_resid %>%
  select(-rowId) %>%
  group_by(sex_female, .add = T) %>%
  group_split() %>%
  map(function(strat){
    scaledat <- strat %>%
      select(-sex_female)
    mod <- GMM(data = scaledat,
               gaussian_comps = 9,
               em_iter = 10)
    pred <- predict_GMM(scaledat,
                        CENTROIDS = mod$centroids,
                        COVARIANCE = mod$covariance_matrices, 
                        WEIGHTS = mod$weights)
    return(list(mod = mod,
                pred = pred,
                grp = strat$sex_female[1]))
  })

gmm_res <- list(gmm_res[[2]], gmm_res[[1]])

bbdd_umap_50_resid <- bbdd_umap_50_resid %>%
  group_by(sex, .add = T) %>%
  group_split() %>%
  map2_dfr(gmm_res,
           ~.x %>%
             bind_cols(cluster_gmm = as.factor(.y$pred$cluster_labels)))


ggplot(bbdd_umap_50_resid) +
  geom_point(aes(x = X1, y = X2, colour = cluster_gmm)) +
  facet_grid(. ~ sex) +
  theme_bw()

gmm_res %>%
    map(
        function(strata){
            data.frame(MaxProb = apply(strata$pred$cluster_proba, 1, max)) %>%
                ggplot(aes(MaxProb)) +
                geom_histogram(bins = 10, fill = "lightblue", color = "black")
        }
    ) %>%
    wrap_plots &
    theme_bw() &
    labs(y = "Count")

gmm_res %>%
    map_dfr(~data.frame(prob80 = apply(.x$pred$cluster_proba, 1, max) > .8), .id = "sex") %>%
    ggplot(aes(sex, fill = prob80)) +
    geom_bar(stat="count", position ="fill") +
    theme_bw() +
    scale_y_continuous(labels = scales::percent) +
    labs(x = NULL, y = "%", fill = "MaxP > 80%")

gmm_res %>%
    map_dfr(~data.frame(cluster = .x$pred$cluster_labels) %>%
                count(cluster, name = "size") %>%
                mutate(prop = size / sum(size),
                       sex = .x$grp)) %>%
    ggplot(aes(cluster, prop * 100)) +
    geom_col(fill = "lightblue", color = "black") +
    facet_wrap(~sex) +
    theme_bw() +
    labs(x = "Cluster", y = "%")

gmm_res %>%
    map2(bbdd %>%
           select(HbA1c, Leukocytes, Monocytes, cLDL, Tg, cHDL, DBP, SBP, Glucose, TimeT2DM, Ferritin,
                  sex_female) %>% 
           group_by(sex_female, .add = T) %>%
           group_split(),
         function(strata, data){
           varnames <- names(select(data, -c(sex_female)))
           centroid_dat <- strata$mod$centroids %>%
             data.frame %>%
             setNames(paste0(varnames, "_mean"))
           sd_dat <- strata$mod$covariance_matrices %>%
             sqrt %>%
             data.frame %>%
             setNames(paste0(varnames, "_sd"))
           cbind(centroid_dat, sd_dat) %>%
             tibble::rownames_to_column("component") %>%
             pivot_longer(-component, names_sep = "_", names_to = c("name", ".value")) %>%
             mutate(sd = qnorm(1 - (0.05/2)) * sd) %>%
             ggplot(aes(mean, component)) +
             geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
             geom_vline(xintercept = c(-1, 1) * qnorm(1 - .2/2), lty = "dashed", color = "darkred") +
             geom_linerange(aes(xmin = mean - sd, xmax = mean + sd)) +
             geom_point(size = 2) +
             facet_wrap(~name, ncol = 3, scales = "free_x") +
             labs(title = strata$grp, x = NULL, y = NULL) +
             theme_bw()
         }) %>%
  wrap_plots(nrow = 1)

bbdd %>%
  select(age, BMI, HbA1c, Leukocytes, Monocytes, cLDL, Tg, cHDL, DBP, SBP, Glucose, TimeT2DM, Ferritin,
         sex_female) %>%
  group_by(sex_female, .add = T) %>%
  group_split() %>%
  map2(bbdd_umap_50_resid %>%
         group_by(sex_female = sex, .add = T) %>%
         group_split(),
       ~.x %>%
         bind_cols(.y %>% select(-sex_female, -X1, -X2)) %>%
         pivot_longer(-c(cluster_gmm, sex, sex_female)) %>%
         group_by(name) %>%
         mutate(mval = mean(value)) %>%
         ungroup %>%
         mutate(name = factor(name,
                              levels = c('age', 'BMI', 'HbA1c', 'Glucose', 'Leukocytes', 'Monocytes',
                                         'cLDL', 'cHDL', 'Tg', 'DBP', 'SBP', 'TimeT2DM', 'Ferritin'))) %>%
         ggplot(aes(cluster_gmm, value)) +
         geom_boxplot(aes(group = cluster_gmm, colour = cluster_gmm)) +
         geom_hline(aes(yintercept = mval), linetype = "dashed", color = "red") +
         facet_wrap(~name, scales = "free", nrow = 1) +
         theme_bw() +
         theme(legend.position = 'none') +
         labs(title = .x$sex[1], x = NULL, y = NULL)) %>%
  modify_at(2, ~.x + labs(x = "Cluster")) %>%
  wrap_plots(ncol = 1)
## Warning: Unknown or uninitialised column: `sex`.
## Unknown or uninitialised column: `sex`.

gmm_9 <- gmm_res

10 clusters

bbdd_umap_50_resid <- bbdd_umap_50_resid_saved
set.seed(12345)
gmm_res <- bbdd_resid %>%
  select(-rowId) %>%
  group_by(sex_female, .add = T) %>%
  group_split() %>%
  map(function(strat){
    scaledat <- strat %>%
      select(-sex_female)
    mod <- GMM(data = scaledat,
               gaussian_comps = 10,
               em_iter = 10)
    pred <- predict_GMM(scaledat,
                        CENTROIDS = mod$centroids,
                        COVARIANCE = mod$covariance_matrices, 
                        WEIGHTS = mod$weights)
    return(list(mod = mod,
                pred = pred,
                grp = strat$sex_female[1]))
  })

gmm_res <- list(gmm_res[[2]], gmm_res[[1]])

bbdd_umap_50_resid <- bbdd_umap_50_resid %>%
  group_by(sex, .add = T) %>%
  group_split() %>%
  map2_dfr(gmm_res,
           ~.x %>%
             bind_cols(cluster_gmm = as.factor(.y$pred$cluster_labels)))


ggplot(bbdd_umap_50_resid) +
  geom_point(aes(x = X1, y = X2, colour = cluster_gmm)) +
  facet_grid(. ~ sex) +
  theme_bw()

gmm_res %>%
    map(
        function(strata){
            data.frame(MaxProb = apply(strata$pred$cluster_proba, 1, max)) %>%
                ggplot(aes(MaxProb)) +
                geom_histogram(bins = 10, fill = "lightblue", color = "black")
        }
    ) %>%
    wrap_plots &
    theme_bw() &
    labs(y = "Count")

gmm_res %>%
    map_dfr(~data.frame(prob80 = apply(.x$pred$cluster_proba, 1, max) > .8), .id = "sex") %>%
    ggplot(aes(sex, fill = prob80)) +
    geom_bar(stat="count", position ="fill") +
    theme_bw() +
    scale_y_continuous(labels = scales::percent) +
    labs(x = NULL, y = "%", fill = "MaxP > 80%")

gmm_res %>%
    map_dfr(~data.frame(cluster = .x$pred$cluster_labels) %>%
                count(cluster, name = "size") %>%
                mutate(prop = size / sum(size),
                       sex = .x$grp)) %>%
    ggplot(aes(cluster, prop * 100)) +
    geom_col(fill = "lightblue", color = "black") +
    facet_wrap(~sex) +
    theme_bw() +
    labs(x = "Cluster", y = "%")

gmm_res %>%
    map2(bbdd %>%
           select(HbA1c, Leukocytes, Monocytes, cLDL, Tg, cHDL, DBP, SBP, Glucose, TimeT2DM, Ferritin,
                  sex_female) %>% 
           group_by(sex_female, .add = T) %>%
           group_split(),
         function(strata, data){
           varnames <- names(select(data, -c(sex_female)))
           centroid_dat <- strata$mod$centroids %>%
             data.frame %>%
             setNames(paste0(varnames, "_mean"))
           sd_dat <- strata$mod$covariance_matrices %>%
             sqrt %>%
             data.frame %>%
             setNames(paste0(varnames, "_sd"))
           cbind(centroid_dat, sd_dat) %>%
             tibble::rownames_to_column("component") %>%
             pivot_longer(-component, names_sep = "_", names_to = c("name", ".value")) %>%
             mutate(sd = qnorm(1 - (0.05/2)) * sd) %>%
             ggplot(aes(mean, component)) +
             geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
             geom_vline(xintercept = c(-1, 1) * qnorm(1 - .2/2), lty = "dashed", color = "darkred") +
             geom_linerange(aes(xmin = mean - sd, xmax = mean + sd)) +
             geom_point(size = 2) +
             facet_wrap(~name, ncol = 3, scales = "free_x") +
             labs(title = strata$grp, x = NULL, y = NULL) +
             theme_bw()
         }) %>%
  wrap_plots(nrow = 1)

bbdd %>%
  select(age, BMI, HbA1c, Leukocytes, Monocytes, cLDL, Tg, cHDL, DBP, SBP, Glucose, TimeT2DM, Ferritin,
         sex_female) %>%
  group_by(sex_female, .add = T) %>%
  group_split() %>%
  map2(bbdd_umap_50_resid %>%
         group_by(sex_female = sex, .add = T) %>%
         group_split(),
       ~.x %>%
         bind_cols(.y %>% select(-sex_female, -X1, -X2)) %>%
         pivot_longer(-c(cluster_gmm, sex, sex_female)) %>%
         group_by(name) %>%
         mutate(mval = mean(value)) %>%
         ungroup %>%
         mutate(name = factor(name,
                              levels = c('age', 'BMI', 'HbA1c', 'Glucose', 'Leukocytes', 'Monocytes',
                                         'cLDL', 'cHDL', 'Tg', 'DBP', 'SBP', 'TimeT2DM', 'Ferritin'))) %>%
         ggplot(aes(cluster_gmm, value)) +
         geom_boxplot(aes(group = cluster_gmm, colour = cluster_gmm)) +
         geom_hline(aes(yintercept = mval), linetype = "dashed", color = "red") +
         facet_wrap(~name, scales = "free", nrow = 1) +
         theme_bw() +
         theme(legend.position = 'none') +
         labs(title = .x$sex[1], x = NULL, y = NULL)) %>%
  modify_at(2, ~.x + labs(x = "Cluster")) %>%
  wrap_plots(ncol = 1)
## Warning: Unknown or uninitialised column: `sex`.
## Unknown or uninitialised column: `sex`.

gmm10 <- gmm_res

Análisis de supervivencia

create_bbdd_clusters <- function(gmm_res){
gmm_res <- list(gmm_res[[2]], gmm_res[[1]])
bbdd_resid_sex <- bbdd_resid %>% group_by(sex_female) %>% group_split()
bbdd_resid_clusters <- list()
bbdd_resid_clusters[[2]] <- cbind(bbdd_resid_sex[[1]], gmm_res[[1]]$pred$cluster_labels)
bbdd_resid_clusters[[1]] <- cbind(bbdd_resid_sex[[2]], gmm_res[[2]]$pred$cluster_labels)

bbdd_covar_sex <- bbdd_covar %>% group_by(sex_female) %>% group_split()

bbdd_num <- list()
bbdd_num[[1]] <- bbdd_covar_sex[[1]][which(bbdd_covar_sex[[1]]$rowId %in% bbdd_resid_clusters[[1]]$rowId),]
bbdd_num[[2]] <- bbdd_covar_sex[[2]][which(bbdd_covar_sex[[2]]$rowId %in% bbdd_resid_clusters[[2]]$rowId),]

n <- colnames(bbdd_num[[1]])

bbdd_clusters <- list()
bbdd_clusters[[1]] <- cbind(bbdd_num[[1]], gmm_res[[2]]$pred$cluster_labels)
bbdd_clusters[[2]] <- cbind(bbdd_num[[2]], gmm_res[[1]]$pred$cluster_labels)

colnames(bbdd_clusters[[1]]) <- c(n, "clusters")
colnames(bbdd_clusters[[2]]) <- c(n, "clusters")

bbdd_clusters[[1]] <- bbdd_clusters[[1]] %>% filter(ami == 0, stroke == 0, angor==0, tia == 0)
bbdd_clusters[[2]] <- bbdd_clusters[[2]] %>% filter(ami == 0, stroke == 0, angor==0, tia == 0)

bbdd_clusters <- rbind(bbdd_clusters[[1]], bbdd_clusters[[2]])
bbdd_clusters <- bbdd_clusters %>% select(rowId, age, BMI, HbA1c, Leukocytes, Monocytes, cLDL, 
                                                sex_female, Tg, cHDL, DBP, SBP, Glucose, TimeT2DM, Ferritin, clusters,
                                                t.ep_StrokeI, i.ep_StrokeI, stroke, t.ep_TIA, i.ep_TIA, tia,
                                                t.ep_Angor, i.ep_Angor, angor, t.ep_AMI, i.ep_AMI, ami)

bbdd_clusters$clusters <- as.factor(bbdd_clusters$clusters)

return(bbdd_clusters)
}

6 clusters

bbdd_clusters_6 <- create_bbdd_clusters(gmm_6)
head(bbdd_clusters_6)
##   rowId age   BMI HbA1c Leukocytes Monocytes cLDL sex_female  Tg cHDL DBP SBP
## 1  3785  61 19.27   6.5        6.6       8.8  123        Men  87   75  80 150
## 2  4678  67 24.71   7.7        9.5       5.4   83        Men  62   60  60 110
## 3  6049  56 34.32   7.9        7.3       5.6  110        Men  74   52  84 123
## 4 25754  70 30.11  10.2        8.1       7.0  220        Men 168   47  72 130
## 5 27214  71 24.91   5.7        3.5       7.1  122        Men 214   58  60 120
## 6 30046  71 33.96   6.5        6.6      11.5  107        Men 160   48  75 145
##   Glucose  TimeT2DM Ferritin clusters t.ep_StrokeI i.ep_StrokeI stroke t.ep_TIA
## 1     136 10.023272    100.7        3         4199            0      0     4199
## 2     227  8.996578    154.8        5         3246            0      0     3246
## 3     150  0.000000    297.9        1          481            0      0      481
## 4     278  0.000000    165.0        6         1411            0      0     1411
## 5     109  4.818617    279.1        4         4199            0      0     4199
## 6     105  0.000000    579.0        4         2352            0      0     2352
##   i.ep_TIA tia t.ep_Angor i.ep_Angor angor t.ep_AMI i.ep_AMI ami
## 1        0   0       4199          0     0     4199        0   0
## 2        0   0       3246          0     0     3246        0   0
## 3        0   0        481          0     0      481        0   0
## 4        0   0       1411          0     0     1411        0   0
## 5        0   0       4199          0     0     4199        0   0
## 6        0   0       2352          0     0     2352        0   0

STROKE

bbdd_clusters_6 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_StrokeI, i.ep_StrokeI) ~ clusters, data=data), main="Stroke survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_6$clusters))
##   1   2   3   4   5   6 
## 310 279 438 490 466 231
bbdd_clusters_6$clusters <- as.factor(bbdd_clusters_6$clusters)

clusters_men <- bbdd_clusters_6 %>% filter(sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 4) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_6 %>% filter(sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 2)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_6_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_StrokeI, i.ep_StrokeI)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_6_surv)
## [[1]]
## # A tibble: 7 x 7
##   term      estimate std.error statistic   p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 age          1.07     0.0164    4.38   0.0000119    1.04       1.11
## 2 BMI          0.996    0.0373   -0.0995 0.921        0.926      1.07
## 3 clusters1    4.48     0.765     1.96   0.0498       1.00      20.0 
## 4 clusters2    2.56     0.819     1.15   0.251        0.515     12.7 
## 5 clusters3    2.24     0.765     1.06   0.290        0.502     10.0 
## 6 clusters5    2.30     0.802     1.04   0.299        0.478     11.1 
## 7 clusters6    3.24     0.914     1.29   0.198        0.541     19.4 
## 
## [[2]]
## # A tibble: 7 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age          1.04     0.0133    3.29   0.00101    1.02       1.07
## 2 BMI          0.956    0.0254   -1.79   0.0739     0.909      1.00
## 3 clusters1    1.77     0.518     1.10   0.270      0.642      4.89
## 4 clusters3    1.11     0.579     0.176  0.861      0.356      3.44
## 5 clusters4    1.01     0.495     0.0301 0.976      0.385      2.68
## 6 clusters5    1.57     0.467     0.969  0.333      0.630      3.92
## 7 clusters6    1.11     0.580     0.178  0.858      0.356      3.45
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_6_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 5 rows containing missing values (geom_segment).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_6_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")

TIA

bbdd_clusters_6 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_TIA, i.ep_TIA) ~ clusters, data=data), main="TIA survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_6$clusters))
##   1   2   3   4   5   6 
## 310 279 438 490 466 231
bbdd_clusters_6$clusters <- as.factor(bbdd_clusters_6$clusters)

clusters_men <- bbdd_clusters_6 %>% filter(sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 5) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_6 %>% filter(sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 6)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_6_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_TIA, i.ep_TIA)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_6_surv)
## [[1]]
## # A tibble: 7 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age          1.05     0.0218     2.13   0.0328    1.00       1.09
## 2 BMI          0.966    0.0538    -0.634  0.526     0.870      1.07
## 3 clusters1    4.45     1.12       1.33   0.182     0.497     39.9 
## 4 clusters2    4.15     1.16       1.23   0.218     0.431     40.0 
## 5 clusters3    4.62     1.06       1.44   0.149     0.578     37.0 
## 6 clusters4    6.15     1.12       1.62   0.104     0.687     55.1 
## 7 clusters6    3.09     1.42       0.797  0.426     0.193     49.6 
## 
## [[2]]
## # A tibble: 7 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age          1.04     0.0145     2.64  0.00825    1.01       1.07
## 2 BMI          0.975    0.0275    -0.916 0.360      0.924      1.03
## 3 clusters1    1.45     0.647      0.578 0.563      0.409      5.16
## 4 clusters2    1.34     0.648      0.454 0.650      0.377      4.78
## 5 clusters3    1.74     0.629      0.876 0.381      0.506      5.95
## 6 clusters4    1.61     0.563      0.848 0.397      0.534      4.86
## 7 clusters5    1.10     0.593      0.168 0.867      0.346      3.53
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_6_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,8) +
    labs(x = "Hazard ratio")
## Warning: Removed 5 rows containing missing values (geom_segment).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_6_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 8) +
    labs(x = "Hazard ratio")

AMI

bbdd_clusters_6 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_AMI, i.ep_AMI) ~ clusters, data=data), main="AMI survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_6$clusters))
##   1   2   3   4   5   6 
## 310 279 438 490 466 231
bbdd_clusters_6$clusters <- as.factor(bbdd_clusters_6$clusters)

clusters_men <- bbdd_clusters_6 %>% filter(clusters != 6, sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 3) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_6 %>% filter(clusters != 4, sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 6)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_6_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_AMI, i.ep_AMI)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_6_surv)
## [[1]]
## # A tibble: 7 x 7
##   term      estimate std.error statistic  p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 age          1.03     0.0184     1.75   0.0809     0.996      1.07
## 2 BMI          0.942    0.0468    -1.27   0.203      0.860      1.03
## 3 clusters1    2.50     0.606      1.51   0.132      0.760      8.19
## 4 clusters2    4.90     0.549      2.90   0.00379    1.67      14.4 
## 5 clusters4    3.31     0.607      1.97   0.0484     1.01      10.9 
## 6 clusters5    1.09     0.731      0.112  0.911      0.259      4.55
## 7 clusters6   NA        0         NA     NA         NA         NA   
## 
## [[2]]
## # A tibble: 7 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age          1.05     0.0258    1.71    0.0874    0.994      1.10
## 2 BMI          1.01     0.0439    0.251   0.802     0.928      1.10
## 3 clusters1    0.990    1.00     -0.0104  0.992     0.139      7.07
## 4 clusters2    2.31     0.842     0.992   0.321     0.443     12.0 
## 5 clusters3    1.06     1.00      0.0594  0.953     0.148      7.60
## 6 clusters4   NA        0        NA      NA        NA         NA   
## 7 clusters5    1.29     0.819     0.308   0.758     0.258      6.41
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_6_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 4 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_point).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_6_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 5 rows containing missing values (geom_segment).
## Removed 1 rows containing missing values (geom_point).

ANGOR

bbdd_clusters_6 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_Angor, i.ep_Angor) ~ clusters, data=data), main="Angor survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_6$clusters))
##   1   2   3   4   5   6 
## 310 279 438 490 466 231
bbdd_clusters_6$clusters <- as.factor(bbdd_clusters_6$clusters)

clusters_men <- bbdd_clusters_6 %>% filter(sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters,6) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_6 %>% filter(sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 6)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_6_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_Angor, i.ep_Angor)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_6_surv)
## [[1]]
## # A tibble: 7 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age           1.03    0.0167     1.59   0.112     0.994      1.06
## 2 BMI           1.08    0.0344     2.30   0.0212    1.01       1.16
## 3 clusters1     1.71    1.12       0.479  0.632     0.189     15.5 
## 4 clusters2     5.15    1.07       1.53   0.126     0.631     42.0 
## 5 clusters3     1.94    1.07       0.621  0.535     0.240     15.7 
## 6 clusters4     3.39    1.08       1.13   0.258     0.408     28.3 
## 7 clusters5     2.07    1.10       0.662  0.508     0.240     17.8 
## 
## [[2]]
## # A tibble: 7 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age          1.01     0.0189     0.781   0.435    0.978      1.05
## 2 BMI          0.970    0.0371    -0.817   0.414    0.902      1.04
## 3 clusters1    3.14     0.818      1.40    0.161    0.633     15.6 
## 4 clusters2    2.02     0.870      0.810   0.418    0.368     11.1 
## 5 clusters3    1.11     1.00       0.101   0.919    0.155      7.89
## 6 clusters4    1.39     0.817      0.402   0.687    0.280      6.90
## 7 clusters5    1.17     0.838      0.189   0.850    0.227      6.06
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_6_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 5 rows containing missing values (geom_segment).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_6_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 5 rows containing missing values (geom_segment).

7 clusters

bbdd_clusters_7 <- create_bbdd_clusters(gmm_7)
head(bbdd_clusters_7)
##   rowId age   BMI HbA1c Leukocytes Monocytes cLDL sex_female  Tg cHDL DBP SBP
## 1  3785  61 19.27   6.5        6.6       8.8  123        Men  87   75  80 150
## 2  4678  67 24.71   7.7        9.5       5.4   83        Men  62   60  60 110
## 3  6049  56 34.32   7.9        7.3       5.6  110        Men  74   52  84 123
## 4 25754  70 30.11  10.2        8.1       7.0  220        Men 168   47  72 130
## 5 27214  71 24.91   5.7        3.5       7.1  122        Men 214   58  60 120
## 6 30046  71 33.96   6.5        6.6      11.5  107        Men 160   48  75 145
##   Glucose  TimeT2DM Ferritin clusters t.ep_StrokeI i.ep_StrokeI stroke t.ep_TIA
## 1     136 10.023272    100.7        2         4199            0      0     4199
## 2     227  8.996578    154.8        1         3246            0      0     3246
## 3     150  0.000000    297.9        5          481            0      0      481
## 4     278  0.000000    165.0        6         1411            0      0     1411
## 5     109  4.818617    279.1        1         4199            0      0     4199
## 6     105  0.000000    579.0        4         2352            0      0     2352
##   i.ep_TIA tia t.ep_Angor i.ep_Angor angor t.ep_AMI i.ep_AMI ami
## 1        0   0       4199          0     0     4199        0   0
## 2        0   0       3246          0     0     3246        0   0
## 3        0   0        481          0     0      481        0   0
## 4        0   0       1411          0     0     1411        0   0
## 5        0   0       4199          0     0     4199        0   0
## 6        0   0       2352          0     0     2352        0   0

STROKE

bbdd_clusters_7 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_StrokeI, i.ep_StrokeI) ~ clusters, data=data), main="Stroke survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_7$clusters))
##   1   2   3   4   5   6   7 
## 341 277 350 355 446 213 232
bbdd_clusters_7$clusters <- as.factor(bbdd_clusters_7$clusters)

clusters_men <- bbdd_clusters_7 %>% filter(sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 2) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_7 %>% filter(sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 1)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_7_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_StrokeI, i.ep_StrokeI)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_7_surv)
## [[1]]
## # A tibble: 8 x 7
##   term      estimate std.error statistic    p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>      <dbl>    <dbl>     <dbl>
## 1 age           1.08    0.0166    4.52   0.00000617    1.04       1.11
## 2 BMI           1.00    0.0375    0.0852 0.932         0.932      1.08
## 3 clusters1     2.19    0.691     1.13   0.257         0.565      8.49
## 4 clusters3     1.43    0.660     0.542  0.588         0.392      5.21
## 5 clusters4     1.37    0.818     0.387  0.699         0.276      6.82
## 6 clusters5     2.18    0.669     1.16   0.244         0.588      8.08
## 7 clusters6     2.54    0.768     1.22   0.224         0.565     11.4 
## 8 clusters7     1.27    0.707     0.334  0.738         0.317      5.07
## 
## [[2]]
## # A tibble: 8 x 7
##   term      estimate std.error statistic  p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 age          1.05     0.0133     3.35  0.000820    1.02       1.07
## 2 BMI          0.953    0.0259    -1.86  0.0634      0.906      1.00
## 3 clusters2    1.36     0.488      0.633 0.527       0.523      3.54
## 4 clusters3    2.94     0.476      2.26  0.0236      1.16       7.46
## 5 clusters4    0.948    0.487     -0.110 0.912       0.365      2.46
## 6 clusters5    1.11     0.452      0.241 0.810       0.460      2.70
## 7 clusters6    1.23     0.541      0.388 0.698       0.427      3.56
## 8 clusters7    1.67     0.540      0.944 0.345       0.577      4.80
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_7_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 4 rows containing missing values (geom_segment).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_7_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 1 rows containing missing values (geom_segment).

TIA

bbdd_clusters_7 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_TIA, i.ep_TIA) ~ clusters, data=data), main="TIA survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_7$clusters))
##   1   2   3   4   5   6   7 
## 341 277 350 355 446 213 232
bbdd_clusters_7$clusters <- as.factor(bbdd_clusters_7$clusters)

clusters_men <- bbdd_clusters_7 %>% filter(clusters !=2, sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 6) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_7 %>% filter(sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 6)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_7_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_TIA, i.ep_TIA)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_7_surv)
## [[1]]
## # A tibble: 8 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age          1.05     0.0222    2.06    0.0394   1.00        1.09
## 2 BMI          0.967    0.0547   -0.612   0.540    0.869       1.08
## 3 clusters1    1.07     1.23      0.0580  0.954    0.0964     12.0 
## 4 clusters2   NA        0        NA      NA       NA          NA   
## 5 clusters3    1.44     1.08      0.338   0.735    0.173      12.0 
## 6 clusters4    1.47     1.23      0.314   0.754    0.133      16.2 
## 7 clusters5    1.60     1.12      0.423   0.673    0.179      14.4 
## 8 clusters7    2.28     1.09      0.759   0.448    0.271      19.2 
## 
## [[2]]
## # A tibble: 8 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age          1.04     0.0146     2.66  0.00784    1.01       1.07
## 2 BMI          0.975    0.0278    -0.904 0.366      0.923      1.03
## 3 clusters1    2.30     0.652      1.28  0.201      0.641      8.27
## 4 clusters2    2.02     0.678      1.04  0.298      0.536      7.65
## 5 clusters3    3.30     0.690      1.73  0.0835     0.854     12.8 
## 6 clusters4    1.39     0.678      0.485 0.628      0.368      5.24
## 7 clusters5    1.34     0.667      0.436 0.663      0.362      4.94
## 8 clusters7    0.891    0.914     -0.126 0.900      0.149      5.34
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_7_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 6 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_point).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_7_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 3 rows containing missing values (geom_segment).

AMI

bbdd_clusters_7 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_AMI, i.ep_AMI) ~ clusters, data=data), main="AMI survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_7$clusters))
##   1   2   3   4   5   6   7 
## 341 277 350 355 446 213 232
bbdd_clusters_7$clusters <- as.factor(bbdd_clusters_7$clusters)

clusters_men <- bbdd_clusters_7 %>% filter(clusters != 6, sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 3) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_7 %>% filter(clusters !=4, sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 6)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_7_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_AMI, i.ep_AMI)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_7_surv)
## [[1]]
## # A tibble: 8 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age          1.03     0.0183      1.79  0.0742    0.997      1.07
## 2 BMI          0.942    0.0458     -1.30  0.194     0.861      1.03
## 3 clusters1    3.24     0.766       1.54  0.124     0.724     14.5 
## 4 clusters2    5.52     0.709       2.41  0.0160    1.38      22.1 
## 5 clusters4    5.78     0.731       2.40  0.0164    1.38      24.2 
## 6 clusters5    3.63     0.708       1.82  0.0685    0.907     14.5 
## 7 clusters6   NA        0          NA    NA        NA         NA   
## 8 clusters7    3.50     0.712       1.76  0.0782    0.868     14.1 
## 
## [[2]]
## # A tibble: 8 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age           1.05    0.0260     1.81   0.0695    0.996      1.10
## 2 BMI           1.02    0.0456     0.339  0.734     0.929      1.11
## 3 clusters1     1.33    1.23       0.230  0.818     0.120     14.7 
## 4 clusters2     3.74    1.10       1.20   0.229     0.436     32.2 
## 5 clusters3     4.24    1.16       1.25   0.211     0.440     40.8 
## 6 clusters4    NA       0         NA     NA        NA         NA   
## 7 clusters5     1.68    1.12       0.465  0.642     0.188     15.1 
## 8 clusters7     2.83    1.23       0.848  0.396     0.256     31.3
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_7_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 6 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_point).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_7_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 6 rows containing missing values (geom_segment).
## Removed 1 rows containing missing values (geom_point).

ANGOR

bbdd_clusters_7 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_Angor, i.ep_Angor) ~ clusters, data=data), main="Angor survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_7$clusters))
##   1   2   3   4   5   6   7 
## 341 277 350 355 446 213 232
bbdd_clusters_7$clusters <- as.factor(bbdd_clusters_7$clusters)

clusters_men <- bbdd_clusters_7 %>% filter(sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 1) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_7 %>% filter(sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 4)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_7_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_Angor, i.ep_Angor)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_7_surv)
## [[1]]
## # A tibble: 8 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age           1.03    0.0167     1.72   0.0855    0.996      1.06
## 2 BMI           1.08    0.0344     2.14   0.0322    1.01       1.15
## 3 clusters2     3.32    0.818      1.47   0.142     0.668     16.5 
## 4 clusters3     1.84    0.782      0.782  0.434     0.398      8.54
## 5 clusters4     2.40    0.869      1.01   0.313     0.437     13.2 
## 6 clusters5     1.69    0.837      0.627  0.531     0.328      8.72
## 7 clusters6     1.50    1.00       0.406  0.685     0.210     10.7 
## 8 clusters7     1.49    0.868      0.460  0.646     0.272      8.18
## 
## [[2]]
## # A tibble: 8 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age          1.02     0.0192     0.826  0.409     0.978      1.05
## 2 BMI          0.969    0.0376    -0.835  0.403     0.900      1.04
## 3 clusters1    6.11     1.10       1.65   0.0988    0.713     52.3 
## 4 clusters2    5.87     1.12       1.58   0.114     0.655     52.5 
## 5 clusters3   10.3      1.12       2.09   0.0368    1.15      92.6 
## 6 clusters5    5.73     1.07       1.63   0.103     0.705     46.7 
## 7 clusters6    1.82     1.41       0.422  0.673     0.114     29.1 
## 8 clusters7    7.81     1.15       1.78   0.0751    0.812     75.2
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_7_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 6 rows containing missing values (geom_segment).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_7_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 6 rows containing missing values (geom_segment).
## Warning: Removed 3 rows containing missing values (geom_point).

8 clusters

bbdd_clusters_8 <- create_bbdd_clusters(gmm_8)
head(bbdd_clusters_8)
##   rowId age   BMI HbA1c Leukocytes Monocytes cLDL sex_female  Tg cHDL DBP SBP
## 1  3785  61 19.27   6.5        6.6       8.8  123        Men  87   75  80 150
## 2  4678  67 24.71   7.7        9.5       5.4   83        Men  62   60  60 110
## 3  6049  56 34.32   7.9        7.3       5.6  110        Men  74   52  84 123
## 4 25754  70 30.11  10.2        8.1       7.0  220        Men 168   47  72 130
## 5 27214  71 24.91   5.7        3.5       7.1  122        Men 214   58  60 120
## 6 30046  71 33.96   6.5        6.6      11.5  107        Men 160   48  75 145
##   Glucose  TimeT2DM Ferritin clusters t.ep_StrokeI i.ep_StrokeI stroke t.ep_TIA
## 1     136 10.023272    100.7        7         4199            0      0     4199
## 2     227  8.996578    154.8        4         3246            0      0     3246
## 3     150  0.000000    297.9        8          481            0      0      481
## 4     278  0.000000    165.0        6         1411            0      0     1411
## 5     109  4.818617    279.1        3         4199            0      0     4199
## 6     105  0.000000    579.0        4         2352            0      0     2352
##   i.ep_TIA tia t.ep_Angor i.ep_Angor angor t.ep_AMI i.ep_AMI ami
## 1        0   0       4199          0     0     4199        0   0
## 2        0   0       3246          0     0     3246        0   0
## 3        0   0        481          0     0      481        0   0
## 4        0   0       1411          0     0     1411        0   0
## 5        0   0       4199          0     0     4199        0   0
## 6        0   0       2352          0     0     2352        0   0

STROKE

bbdd_clusters_8 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_StrokeI, i.ep_StrokeI) ~ clusters, data=data), main="Stroke survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_8$clusters))
##   1   2   3   4   5   6   7   8 
## 326 237 244 266 363 176 341 261
bbdd_clusters_8$clusters <- as.factor(bbdd_clusters_8$clusters)

clusters_men <- bbdd_clusters_8 %>% filter(sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 4) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_8 %>% filter(sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 4)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_8_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_StrokeI, i.ep_StrokeI)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_8_surv)
## [[1]]
## # A tibble: 9 x 7
##   term      estimate std.error statistic    p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>      <dbl>    <dbl>     <dbl>
## 1 age           1.08    0.0166   4.44    0.00000900    1.04       1.11
## 2 BMI           1.00    0.0384  -0.00122 0.999         0.927      1.08
## 3 clusters1     4.59    1.10     1.39    0.165         0.534     39.5 
## 4 clusters2     2.77    1.16     0.883   0.377         0.288     26.7 
## 5 clusters3     4.67    1.05     1.46    0.144         0.591     36.9 
## 6 clusters5     2.35    1.16     0.739   0.460         0.244     22.6 
## 7 clusters6     4.72    1.16     1.34    0.179         0.490     45.5 
## 8 clusters7     2.69    1.06     0.931   0.352         0.336     21.5 
## 9 clusters8     5.52    1.05     1.63    0.104         0.706     43.1 
## 
## [[2]]
## # A tibble: 9 x 7
##   term      estimate std.error statistic  p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 age          1.05     0.0134     3.41  0.000660    1.02       1.07
## 2 BMI          0.950    0.0261    -1.95  0.0507      0.903      1.00
## 3 clusters1    1.53     0.613      0.692 0.489       0.460      5.08
## 4 clusters2    1.72     0.646      0.841 0.400       0.485      6.11
## 5 clusters3    5.05     0.613      2.64  0.00826     1.52      16.8 
## 6 clusters5    2.10     0.572      1.30  0.194       0.685      6.46
## 7 clusters6    3.11     0.628      1.81  0.0708      0.908     10.7 
## 8 clusters7    3.15     0.584      1.96  0.0496      1.00       9.88
## 9 clusters8    1.85     0.708      0.870 0.384       0.462      7.42
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_8_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 7 rows containing missing values (geom_segment).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_8_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 6 rows containing missing values (geom_segment).

TIA

bbdd_clusters_8 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_TIA, i.ep_TIA) ~ clusters, data=data), main="TIA survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_8$clusters))
##   1   2   3   4   5   6   7   8 
## 326 237 244 266 363 176 341 261
bbdd_clusters_8$clusters <- as.factor(bbdd_clusters_8$clusters)

clusters_men <- bbdd_clusters_8 %>% filter(clusters !=5, sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, ref = 2) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_8 %>% filter(sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 8)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_8_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_TIA, i.ep_TIA)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_8_surv)
## [[1]]
## # A tibble: 9 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age          1.05     0.0224   2.06     0.0398   1.00        1.09
## 2 BMI          0.964    0.0570  -0.647    0.517    0.862       1.08
## 3 clusters1    2.85     1.16     0.904    0.366    0.295      27.5 
## 4 clusters3    3.19     1.08     1.07     0.283    0.383      26.6 
## 5 clusters4    1.00     1.41     0.00289  0.998    0.0627     16.1 
## 6 clusters5   NA        0       NA       NA       NA          NA   
## 7 clusters6    1.52     1.42     0.294    0.769    0.0945     24.4 
## 8 clusters7    1.38     1.12     0.288    0.773    0.154      12.4 
## 9 clusters8    2.77     1.10     0.929    0.353    0.323      23.7 
## 
## [[2]]
## # A tibble: 9 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age          1.04     0.0145    2.69   0.00716    1.01       1.07
## 2 BMI          0.973    0.0278   -0.978  0.328      0.922      1.03
## 3 clusters1    1.12     0.678     0.166  0.868      0.296      4.23
## 4 clusters2    1.27     0.710     0.335  0.738      0.316      5.10
## 5 clusters3    2.16     0.731     1.05   0.293      0.515      9.03
## 6 clusters4    1.31     0.691     0.395  0.693      0.339      5.09
## 7 clusters5    1.25     0.659     0.345  0.730      0.345      4.56
## 8 clusters6    1.22     0.764     0.255  0.799      0.272      5.43
## 9 clusters7    1.06     0.731     0.0790 0.937      0.253      4.44
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_8_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 7 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_point).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_8_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 1 rows containing missing values (geom_segment).

AMI

bbdd_clusters_8 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_AMI, i.ep_AMI) ~ clusters, data=data), main="AMI survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_8$clusters))
##   1   2   3   4   5   6   7   8 
## 326 237 244 266 363 176 341 261
bbdd_clusters_8$clusters <- as.factor(bbdd_clusters_8$clusters)

clusters_men <- bbdd_clusters_8 %>% filter(clusters !=6, sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 5) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_8 %>% filter(clusters !=8, clusters !=4, sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 5)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_8_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_AMI, i.ep_AMI)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_8_surv)
## [[1]]
## # A tibble: 9 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age          1.03     0.0186     1.77   0.0764    0.997      1.07
## 2 BMI          0.943    0.0477    -1.23   0.217     0.859      1.04
## 3 clusters1    5.69     1.10       1.58   0.114     0.659     49.1 
## 4 clusters2    6.40     1.10       1.69   0.0908    0.745     54.9 
## 5 clusters3    2.48     1.12       0.813  0.416     0.277     22.3 
## 6 clusters4    5.42     1.12       1.51   0.131     0.603     48.6 
## 7 clusters6   NA        0         NA     NA        NA         NA   
## 8 clusters7    2.56     1.08       0.869  0.385     0.307     21.4 
## 9 clusters8    3.68     1.10       1.19   0.235     0.429     31.6 
## 
## [[2]]
## # A tibble: 9 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age           1.04    0.0258     1.67   0.0948    0.993      1.10
## 2 BMI           1.01    0.0454     0.325  0.745     0.928      1.11
## 3 clusters1     3.82    1.16       1.16   0.247     0.395     36.9 
## 4 clusters2     7.33    1.12       1.78   0.0752    0.817     65.8 
## 5 clusters3     3.69    1.42       0.922  0.356     0.230     59.1 
## 6 clusters4    NA       0         NA     NA        NA         NA   
## 7 clusters6     7.75    1.16       1.77   0.0763    0.805     74.6 
## 8 clusters7     8.98    1.10       2.00   0.0452    1.05      77.0 
## 9 clusters8    NA       0         NA     NA        NA         NA
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_8_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,10) +
    labs(x = "Hazard ratio")
## Warning: Removed 7 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_point).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_8_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 10) +
    labs(x = "Hazard ratio")
## Warning: Removed 7 rows containing missing values (geom_segment).
## Warning: Removed 2 rows containing missing values (geom_point).

ANGOR

bbdd_clusters_8 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_Angor, i.ep_Angor) ~ clusters, data=data), main="Angor survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_8$clusters))
##   1   2   3   4   5   6   7   8 
## 326 237 244 266 363 176 341 261
bbdd_clusters_8$clusters <- as.factor(bbdd_clusters_8$clusters)

clusters_men <- bbdd_clusters_8 %>% filter(sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 6) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_8 %>% filter(sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 4)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_8_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_Angor, i.ep_Angor)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_8_surv)
## [[1]]
## # A tibble: 9 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age           1.03    0.0164     1.71   0.0868    0.996      1.06
## 2 BMI           1.08    0.0351     2.16   0.0307    1.01       1.16
## 3 clusters1     1.76    1.23       0.458  0.647     0.157     19.7 
## 4 clusters2     4.86    1.09       1.46   0.146     0.578     40.8 
## 5 clusters3     2.42    1.08       0.817  0.414     0.290     20.3 
## 6 clusters4     1.58    1.23       0.374  0.708     0.143     17.5 
## 7 clusters5     3.09    1.10       1.03   0.303     0.361     26.5 
## 8 clusters7     1.79    1.09       0.537  0.591     0.213     15.0 
## 9 clusters8     1.75    1.12       0.497  0.619     0.193     15.8 
## 
## [[2]]
## # A tibble: 9 x 7
##   term      estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 age          1.02     0.0191     0.856  0.392     0.979      1.06
## 2 BMI          0.968    0.0378    -0.862  0.388     0.899      1.04
## 3 clusters1    4.61     1.08       1.41   0.157     0.554     38.3 
## 4 clusters2    3.51     1.16       1.09   0.277     0.365     33.8 
## 5 clusters3    6.69     1.16       1.65   0.0999    0.695     64.4 
## 6 clusters5    3.82     1.08       1.24   0.215     0.459     31.8 
## 7 clusters6    4.67     1.16       1.33   0.183     0.484     44.9 
## 8 clusters7    2.23     1.23       0.653  0.514     0.202     24.6 
## 9 clusters8    1.65     1.42       0.355  0.723     0.103     26.5
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_8_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,10) +
    labs(x = "Hazard ratio")
## Warning: Removed 7 rows containing missing values (geom_segment).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_8_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 10) +
    labs(x = "Hazard ratio")
## Warning: Removed 7 rows containing missing values (geom_segment).

9 clusters

bbdd_clusters_9 <- create_bbdd_clusters(gmm_9)
head(bbdd_clusters_9)
##   rowId age   BMI HbA1c Leukocytes Monocytes cLDL sex_female  Tg cHDL DBP SBP
## 1  3785  61 19.27   6.5        6.6       8.8  123        Men  87   75  80 150
## 2  4678  67 24.71   7.7        9.5       5.4   83        Men  62   60  60 110
## 3  6049  56 34.32   7.9        7.3       5.6  110        Men  74   52  84 123
## 4 25754  70 30.11  10.2        8.1       7.0  220        Men 168   47  72 130
## 5 27214  71 24.91   5.7        3.5       7.1  122        Men 214   58  60 120
## 6 30046  71 33.96   6.5        6.6      11.5  107        Men 160   48  75 145
##   Glucose  TimeT2DM Ferritin clusters t.ep_StrokeI i.ep_StrokeI stroke t.ep_TIA
## 1     136 10.023272    100.7        7         4199            0      0     4199
## 2     227  8.996578    154.8        1         3246            0      0     3246
## 3     150  0.000000    297.9        8          481            0      0      481
## 4     278  0.000000    165.0        6         1411            0      0     1411
## 5     109  4.818617    279.1        5         4199            0      0     4199
## 6     105  0.000000    579.0        4         2352            0      0     2352
##   i.ep_TIA tia t.ep_Angor i.ep_Angor angor t.ep_AMI i.ep_AMI ami
## 1        0   0       4199          0     0     4199        0   0
## 2        0   0       3246          0     0     3246        0   0
## 3        0   0        481          0     0      481        0   0
## 4        0   0       1411          0     0     1411        0   0
## 5        0   0       4199          0     0     4199        0   0
## 6        0   0       2352          0     0     2352        0   0

STROKE

bbdd_clusters_9 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_StrokeI, i.ep_StrokeI) ~ clusters, data=data), main="Stroke survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_9$clusters))
##   1   2   3   4   5   6   7   8   9 
## 191 215 224 356 310 163 334 222 199
bbdd_clusters_9$clusters <- as.factor(bbdd_clusters_9$clusters)

clusters_men <- bbdd_clusters_9 %>% filter(sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 9) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_9 %>% filter(sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 1)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_9_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_StrokeI, i.ep_StrokeI)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_9_surv)
## [[1]]
## # A tibble: 10 x 7
##    term      estimate std.error statistic   p.value conf.low conf.high
##    <chr>        <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 age          1.08     0.0166     4.41  0.0000104    1.04       1.11
##  2 BMI          0.995    0.0374    -0.126 0.900        0.925      1.07
##  3 clusters1    3.74     1.10       1.20  0.229        0.436     32.1 
##  4 clusters2    3.69     1.16       1.13  0.259        0.383     35.5 
##  5 clusters3    3.59     1.08       1.18  0.237        0.431     29.9 
##  6 clusters4    4.11     1.08       1.31  0.191        0.494     34.2 
##  7 clusters5    2.60     1.23       0.777 0.437        0.234     28.8 
##  8 clusters6    7.18     1.12       1.76  0.0781       0.801     64.4 
##  9 clusters7    2.89     1.07       0.991 0.322        0.355     23.5 
## 10 clusters8    6.14     1.06       1.71  0.0875       0.766     49.3 
## 
## [[2]]
## # A tibble: 10 x 7
##    term      estimate std.error statistic  p.value conf.low conf.high
##    <chr>        <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 age          1.05     0.0136      3.37 0.000762    1.02       1.08
##  2 BMI          0.953    0.0254     -1.92 0.0553      0.906      1.00
##  3 clusters2    5.71     1.07        1.63 0.103       0.701     46.4 
##  4 clusters3   11.1      1.07        2.25 0.0243      1.37      90.7 
##  5 clusters4    3.56     1.07        1.19 0.235       0.438     29.0 
##  6 clusters5    4.12     1.06        1.34 0.179       0.521     32.7 
##  7 clusters6    7.40     1.08        1.85 0.0642      0.889     61.6 
##  8 clusters7    8.54     1.04        2.05 0.0401      1.10      66.2 
##  9 clusters8    5.67     1.12        1.55 0.122       0.629     51.1 
## 10 clusters9    8.22     1.05        2.00 0.0457      1.04      65.0
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_9_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 8 rows containing missing values (geom_segment).
## Warning: Removed 2 rows containing missing values (geom_point).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_9_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 8 rows containing missing values (geom_segment).
## Warning: Removed 4 rows containing missing values (geom_point).

TIA

bbdd_clusters_9 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_TIA, i.ep_TIA) ~ clusters, data=data), main="TIA survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_9$clusters))
##   1   2   3   4   5   6   7   8   9 
## 191 215 224 356 310 163 334 222 199
bbdd_clusters_9$clusters <- as.factor(bbdd_clusters_9$clusters)

clusters_men <- bbdd_clusters_9 %>% filter(clusters !=6, clusters !=5, clusters !=2, sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 8) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_9 %>% filter(sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 8)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_9_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_TIA, i.ep_TIA)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_9_surv)
## [[1]]
## # A tibble: 10 x 7
##    term      estimate std.error statistic p.value conf.low conf.high
##    <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 age          1.04     0.0223     1.78   0.0757    0.996      1.09
##  2 BMI          0.973    0.0541    -0.504  0.614     0.875      1.08
##  3 clusters1    2.32     0.872      0.965  0.335     0.420     12.8 
##  4 clusters2   NA        0         NA     NA        NA         NA   
##  5 clusters3    1.21     0.915      0.209  0.834     0.202      7.27
##  6 clusters4    2.85     0.818      1.28   0.200     0.574     14.2 
##  7 clusters5   NA        0         NA     NA        NA         NA   
##  8 clusters6   NA        0         NA     NA        NA         NA   
##  9 clusters7    1.19     0.868      0.196  0.845     0.216      6.50
## 10 clusters9    1.33     1.00       0.282  0.778     0.186      9.47
## 
## [[2]]
## # A tibble: 10 x 7
##    term      estimate std.error statistic p.value conf.low conf.high
##    <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 age          1.04     0.0146     2.56   0.0105    1.01       1.07
##  2 BMI          0.975    0.0278    -0.895  0.371     0.924      1.03
##  3 clusters1    3.10     1.12       1.01   0.313     0.344     27.9 
##  4 clusters2    3.70     1.08       1.21   0.227     0.443     30.9 
##  5 clusters3    6.92     1.08       1.79   0.0734    0.832     57.6 
##  6 clusters4    3.47     1.06       1.18   0.239     0.438     27.5 
##  7 clusters5    2.51     1.07       0.859  0.391     0.308     20.4 
##  8 clusters6    3.51     1.12       1.12   0.262     0.392     31.5 
##  9 clusters7    3.40     1.08       1.13   0.258     0.407     28.3 
## 10 clusters9    3.41     1.10       1.12   0.263     0.397     29.3
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_9_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,8) +
    labs(x = "Hazard ratio")
## Warning: Removed 6 rows containing missing values (geom_segment).
## Warning: Removed 3 rows containing missing values (geom_point).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_9_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 8) +
    labs(x = "Hazard ratio")
## Warning: Removed 8 rows containing missing values (geom_segment).

AMI

bbdd_clusters_9 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_AMI, i.ep_AMI) ~ clusters, data=data), main="AMI survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_9$clusters))
##   1   2   3   4   5   6   7   8   9 
## 191 215 224 356 310 163 334 222 199
bbdd_clusters_9$clusters <- as.factor(bbdd_clusters_9$clusters)

clusters_men <- bbdd_clusters_9 %>% filter(clusters !=5, sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 6) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_9 %>% filter(clusters !=8, clusters !=4, clusters !=1, sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 5)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_9_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_AMI, i.ep_AMI)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_9_surv)
## [[1]]
## # A tibble: 10 x 7
##    term      estimate std.error statistic p.value conf.low conf.high
##    <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 age          1.03     0.0188    1.80    0.0726    0.997      1.07
##  2 BMI          0.938    0.0474   -1.36    0.174     0.854      1.03
##  3 clusters1    2.10     1.13      0.658   0.511     0.231     19.1 
##  4 clusters2    3.23     1.12      1.05    0.295     0.360     29.0 
##  5 clusters3    1.80     1.10      0.535   0.593     0.210     15.4 
##  6 clusters4    2.24     1.10      0.734   0.463     0.260     19.2 
##  7 clusters5   NA        0        NA      NA        NA         NA   
##  8 clusters7    1.07     1.12      0.0575  0.954     0.118      9.60
##  9 clusters8    2.48     1.10      0.828   0.407     0.289     21.4 
## 10 clusters9    1.25     1.23      0.180   0.857     0.113     13.8 
## 
## [[2]]
## # A tibble: 10 x 7
##    term      estimate std.error statistic p.value conf.low conf.high
##    <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 age           1.05    0.0265     1.73   0.0844    0.994      1.10
##  2 BMI           1.01    0.0443     0.304  0.761     0.929      1.11
##  3 clusters1    NA       0         NA     NA        NA         NA   
##  4 clusters2     3.78    0.869      1.53   0.125     0.690     20.8 
##  5 clusters3     3.45    1.00       1.24   0.217     0.484     24.6 
##  6 clusters4    NA       0         NA     NA        NA         NA   
##  7 clusters6     2.66    1.00       0.977  0.329     0.374     18.9 
##  8 clusters7     2.56    0.915      1.03   0.305     0.426     15.4 
##  9 clusters8    NA       0         NA     NA        NA         NA   
## 10 clusters9     4.00    0.867      1.60   0.110     0.732     21.9
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_9_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 8 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_point).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_9_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 8 rows containing missing values (geom_segment).
## Warning: Removed 3 rows containing missing values (geom_point).

ANGOR

bbdd_clusters_9 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_Angor, i.ep_Angor) ~ clusters, data=data), main="Angor survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_9$clusters))
##   1   2   3   4   5   6   7   8   9 
## 191 215 224 356 310 163 334 222 199
bbdd_clusters_9$clusters <- as.factor(bbdd_clusters_9$clusters)

clusters_men <- bbdd_clusters_9 %>% filter(sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 9) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_9 %>% filter(sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 4)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_9_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_Angor, i.ep_Angor)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_9_surv)
## [[1]]
## # A tibble: 10 x 7
##    term      estimate std.error statistic p.value conf.low conf.high
##    <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 age           1.03    0.0168     1.92   0.0548    0.999      1.07
##  2 BMI           1.08    0.0347     2.15   0.0317    1.01       1.15
##  3 clusters1     2.83    1.16       0.898  0.369     0.292     27.5 
##  4 clusters2     6.03    1.10       1.64   0.101     0.703     51.7 
##  5 clusters3     5.13    1.05       1.55   0.121     0.649     40.5 
##  6 clusters4     2.69    1.12       0.884  0.377     0.300     24.1 
##  7 clusters5     3.36    1.16       1.05   0.294     0.349     32.4 
##  8 clusters6     2.79    1.23       0.836  0.403     0.251     31.0 
##  9 clusters7     1.33    1.16       0.248  0.804     0.138     12.8 
## 10 clusters8     1.36    1.23       0.251  0.802     0.123     15.1 
## 
## [[2]]
## # A tibble: 10 x 7
##    term      estimate std.error statistic p.value conf.low conf.high
##    <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 age          1.01     0.0191     0.730  0.465     0.977      1.05
##  2 BMI          0.967    0.0375    -0.897  0.370     0.898      1.04
##  3 clusters1    6.25     1.16       1.59   0.113     0.649     60.1 
##  4 clusters2    6.53     1.12       1.68   0.0935    0.729     58.4 
##  5 clusters3    2.80     1.41       0.727  0.467     0.175     44.7 
##  6 clusters5    6.17     1.07       1.70   0.0890    0.758     50.3 
##  7 clusters6    4.28     1.23       1.19   0.235     0.388     47.3 
##  8 clusters7    4.40     1.15       1.28   0.200     0.457     42.3 
##  9 clusters8    2.32     1.42       0.595  0.552     0.145     37.3 
## 10 clusters9    5.25     1.15       1.44   0.151     0.546     50.4
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_9_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,8) +
    labs(x = "Hazard ratio")
## Warning: Removed 8 rows containing missing values (geom_segment).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_9_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 8) +
    labs(x = "Hazard ratio")
## Warning: Removed 8 rows containing missing values (geom_segment).

10 clusters

bbdd_clusters_10 <- create_bbdd_clusters(gmm10)
head(bbdd_clusters_10)
##   rowId age   BMI HbA1c Leukocytes Monocytes cLDL sex_female  Tg cHDL DBP SBP
## 1  3785  61 19.27   6.5        6.6       8.8  123        Men  87   75  80 150
## 2  4678  67 24.71   7.7        9.5       5.4   83        Men  62   60  60 110
## 3  6049  56 34.32   7.9        7.3       5.6  110        Men  74   52  84 123
## 4 25754  70 30.11  10.2        8.1       7.0  220        Men 168   47  72 130
## 5 27214  71 24.91   5.7        3.5       7.1  122        Men 214   58  60 120
## 6 30046  71 33.96   6.5        6.6      11.5  107        Men 160   48  75 145
##   Glucose  TimeT2DM Ferritin clusters t.ep_StrokeI i.ep_StrokeI stroke t.ep_TIA
## 1     136 10.023272    100.7        7         4199            0      0     4199
## 2     227  8.996578    154.8        9         3246            0      0     3246
## 3     150  0.000000    297.9        8          481            0      0      481
## 4     278  0.000000    165.0        6         1411            0      0     1411
## 5     109  4.818617    279.1        4         4199            0      0     4199
## 6     105  0.000000    579.0        4         2352            0      0     2352
##   i.ep_TIA tia t.ep_Angor i.ep_Angor angor t.ep_AMI i.ep_AMI ami
## 1        0   0       4199          0     0     4199        0   0
## 2        0   0       3246          0     0     3246        0   0
## 3        0   0        481          0     0      481        0   0
## 4        0   0       1411          0     0     1411        0   0
## 5        0   0       4199          0     0     4199        0   0
## 6        0   0       2352          0     0     2352        0   0

STROKE

bbdd_clusters_10 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_StrokeI, i.ep_StrokeI) ~ clusters, data=data), main="Stroke survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_10$clusters))
##   1   2   3   4   5   6   7   8   9  10 
## 199 198 249 286 299 211 260 209 140 163
bbdd_clusters_10$clusters <- as.factor(bbdd_clusters_10$clusters)

clusters_men <- bbdd_clusters_10 %>% filter(clusters != 9, clusters !=5, sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 2) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_10 %>% filter(clusters !=1, sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 4)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_10_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_StrokeI, i.ep_StrokeI)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_10_surv)
## [[1]]
## # A tibble: 11 x 7
##    term       estimate std.error statistic    p.value conf.low conf.high
##    <chr>         <dbl>     <dbl>     <dbl>      <dbl>    <dbl>     <dbl>
##  1 age           1.07     0.0166    4.18    0.0000293    1.04       1.11
##  2 BMI           0.999    0.0373   -0.0385  0.969        0.928      1.07
##  3 clusters1     1.26     0.838     0.271   0.786        0.243      6.49
##  4 clusters3     1.18     0.791     0.213   0.831        0.251      5.58
##  5 clusters4     1.40     0.803     0.417   0.677        0.290      6.74
##  6 clusters5    NA        0        NA      NA           NA         NA   
##  7 clusters6     2.08     0.868     0.842   0.400        0.379     11.4 
##  8 clusters7     0.705    0.913    -0.382   0.702        0.118      4.22
##  9 clusters8     2.13     0.792     0.954   0.340        0.451     10.0 
## 10 clusters9    NA        0        NA      NA           NA         NA   
## 11 clusters10    2.03     0.837     0.844   0.399        0.393     10.4 
## 
## [[2]]
## # A tibble: 11 x 7
##    term       estimate std.error statistic  p.value conf.low conf.high
##    <chr>         <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 age           1.05     0.0136      3.27  0.00108    1.02       1.07
##  2 BMI           0.951    0.0264     -1.89  0.0585     0.903      1.00
##  3 clusters1    NA        0          NA    NA         NA         NA   
##  4 clusters2     2.13     0.708       1.07  0.286      0.532      8.52
##  5 clusters3     6.38     0.667       2.78  0.00545    1.73      23.6 
##  6 clusters5     2.03     0.659       1.07  0.284      0.557      7.37
##  7 clusters6     3.33     0.646       1.86  0.0626     0.939     11.8 
##  8 clusters7     2.57     0.677       1.40  0.163      0.683      9.70
##  9 clusters8     2.28     0.765       1.08  0.280      0.510     10.2 
## 10 clusters9     2.51     0.764       1.20  0.229      0.561     11.2 
## 11 clusters10    2.56     0.731       1.29  0.198      0.612     10.8
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_10_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,4) +
    labs(x = "Hazard ratio")
## Warning: Removed 9 rows containing missing values (geom_segment).
## Warning: Removed 2 rows containing missing values (geom_point).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_10_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 4) +
    labs(x = "Hazard ratio")
## Warning: Removed 9 rows containing missing values (geom_segment).
## Removed 2 rows containing missing values (geom_point).

TIA

bbdd_clusters_10 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_TIA, i.ep_TIA) ~ clusters, data=data), main="TIA survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_10$clusters))
##   1   2   3   4   5   6   7   8   9  10 
## 199 198 249 286 299 211 260 209 140 163
bbdd_clusters_10$clusters <- as.factor(bbdd_clusters_10$clusters)

clusters_men <- bbdd_clusters_10 %>% filter(clusters != 6, clusters != 5,
                                            clusters != 2,sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 9) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_10 %>% filter(sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 8)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_10_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_TIA, i.ep_TIA)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_10_surv)
## [[1]]
## # A tibble: 11 x 7
##    term       estimate std.error statistic p.value conf.low conf.high
##    <chr>         <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 age           1.04     0.0220    1.85    0.0641   0.998       1.09
##  2 BMI           0.967    0.0551   -0.604   0.546    0.868       1.08
##  3 clusters1     1.28     1.23      0.199   0.842    0.114      14.3 
##  4 clusters2    NA        0        NA      NA       NA          NA   
##  5 clusters3     1.71     1.10      0.488   0.625    0.199      14.7 
##  6 clusters4     2.86     1.09      0.966   0.334    0.339      24.1 
##  7 clusters5    NA        0        NA      NA       NA          NA   
##  8 clusters6    NA        0        NA      NA       NA          NA   
##  9 clusters7     1.10     1.23      0.0771  0.939    0.0992     12.2 
## 10 clusters8     1.15     1.23      0.117   0.907    0.104      12.9 
## 11 clusters10    2.76     1.16      0.877   0.381    0.285      26.7 
## 
## [[2]]
## # A tibble: 11 x 7
##    term       estimate std.error statistic p.value conf.low conf.high
##    <chr>         <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 age           1.04     0.0146     2.62  0.00872    1.01       1.07
##  2 BMI           0.974    0.0280    -0.948 0.343      0.922      1.03
##  3 clusters1     2.26     1.16       0.704 0.482      0.234     21.8 
##  4 clusters2     3.23     1.10       1.07  0.286      0.375     27.7 
##  5 clusters3     7.43     1.08       1.86  0.0635     0.893     61.8 
##  6 clusters4     3.69     1.08       1.21  0.227      0.444     30.7 
##  7 clusters5     3.13     1.06       1.08  0.282      0.391     25.0 
##  8 clusters6     3.11     1.08       1.05  0.295      0.372     26.0 
##  9 clusters7     4.16     1.07       1.33  0.182      0.512     33.9 
## 10 clusters9     2.25     1.23       0.663 0.507      0.204     24.9 
## 11 clusters10    3.63     1.12       1.15  0.249      0.405     32.5
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_10_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 9 rows containing missing values (geom_segment).
## Warning: Removed 3 rows containing missing values (geom_point).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_10_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 9 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_point).

AMI

bbdd_clusters_10 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_AMI, i.ep_AMI) ~ clusters, data=data), main="AMI survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_10$clusters))
##   1   2   3   4   5   6   7   8   9  10 
## 199 198 249 286 299 211 260 209 140 163
bbdd_clusters_10$clusters <- as.factor(bbdd_clusters_10$clusters)

clusters_men <- bbdd_clusters_10 %>% filter(clusters != 5, sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 10) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_10 %>% filter(clusters !=8,clusters != 1, clusters != 4, sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 5)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_10_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_AMI, i.ep_AMI)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_10_surv)
## [[1]]
## # A tibble: 11 x 7
##    term      estimate std.error statistic p.value conf.low conf.high
##    <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 age          1.03     0.0189    1.54     0.123   0.992       1.07
##  2 BMI          0.944    0.0470   -1.23     0.219   0.861       1.03
##  3 clusters1    4.27     1.08      1.34     0.180   0.511      35.7 
##  4 clusters2    2.48     1.22      0.740    0.459   0.225      27.3 
##  5 clusters3    1.70     1.10      0.486    0.627   0.199      14.6 
##  6 clusters4    2.69     1.10      0.902    0.367   0.314      23.1 
##  7 clusters5   NA        0        NA       NA      NA          NA   
##  8 clusters6    2.31     1.23      0.684    0.494   0.209      25.6 
##  9 clusters7    1.77     1.16      0.494    0.621   0.184      17.1 
## 10 clusters8    3.37     1.10      1.11     0.268   0.393      28.9 
## 11 clusters9    1.04     1.42      0.0270   0.978   0.0646     16.7 
## 
## [[2]]
## # A tibble: 11 x 7
##    term       estimate std.error statistic p.value conf.low conf.high
##    <chr>         <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 age            1.05    0.0259     1.75   0.0795    0.995      1.10
##  2 BMI            1.01    0.0446     0.206  0.837     0.925      1.10
##  3 clusters1     NA       0         NA     NA        NA         NA   
##  4 clusters2      7.19    1.12       1.76   0.0782    0.800     64.7 
##  5 clusters3      3.33    1.42       0.850  0.395     0.208     53.4 
##  6 clusters4     NA       0         NA     NA        NA         NA   
##  7 clusters6      2.82    1.23       0.843  0.399     0.253     31.3 
##  8 clusters7      4.76    1.16       1.35   0.177     0.495     45.8 
##  9 clusters8     NA       0         NA     NA        NA         NA   
## 10 clusters9      9.28    1.15       1.93   0.0538    0.964     89.2 
## 11 clusters10     7.29    1.16       1.72   0.0854    0.758     70.2
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_10_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 9 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_point).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_10_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 9 rows containing missing values (geom_segment).
## Warning: Removed 6 rows containing missing values (geom_point).

ANGOR

bbdd_clusters_10 %>% group_by(sex_female) %>% group_split %>% map(function(data){
  autoplot(survfit(Surv(t.ep_Angor, i.ep_Angor) ~ clusters, data=data), main="Angor survival", censor.size = 0, surv.geom = "line",
           conf.int.fill=NULL)
})
## [[1]]

## 
## [[2]]

summary(as.factor(bbdd_clusters_10$clusters))
##   1   2   3   4   5   6   7   8   9  10 
## 199 198 249 286 299 211 260 209 140 163
bbdd_clusters_10$clusters <- as.factor(bbdd_clusters_10$clusters)

clusters_men <- bbdd_clusters_10 %>% filter(clusters != 9, sex_female=="Men") #FILTRO CUANDO NO APARECE EVENTO
clusters_men$clusters <- relevel(clusters_men$clusters, 7) #REFERENCIA MAYOR SUPERVIVENCIA

clusters_women <- bbdd_clusters_10 %>% filter(sex_female=="Women")
clusters_women$clusters <- relevel(clusters_women$clusters, ref = 8)

clusters = list(clusters_men, clusters_women)

bbdd_clusters_10_surv <- clusters %>%
  map(function(data) {mod = coxph(Surv(t.ep_Angor, i.ep_Angor)~age + BMI+ clusters, data=data)
      tidy(mod, conf.int= TRUE, exponentiate=TRUE)})
head(bbdd_clusters_10_surv)
## [[1]]
## # A tibble: 11 x 7
##    term       estimate std.error statistic p.value conf.low conf.high
##    <chr>         <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 age            1.03    0.0170     1.57   0.116     0.993      1.06
##  2 BMI            1.08    0.0350     2.16   0.0305    1.01       1.16
##  3 clusters1      6.76    1.10       1.74   0.0820    0.785     58.2 
##  4 clusters2      6.19    1.16       1.58   0.114     0.644     59.6 
##  5 clusters3      4.36    1.07       1.38   0.169     0.535     35.5 
##  6 clusters4      3.23    1.12       1.05   0.294     0.361     29.0 
##  7 clusters5      4.44    1.16       1.29   0.197     0.460     42.9 
##  8 clusters6      3.40    1.23       0.997  0.319     0.306     37.8 
##  9 clusters8      1.97    1.23       0.554  0.579     0.179     21.8 
## 10 clusters9     NA       0         NA     NA        NA         NA   
## 11 clusters10     8.23    1.10       1.92   0.0544    0.961     70.5 
## 
## [[2]]
## # A tibble: 11 x 7
##    term       estimate std.error statistic p.value conf.low conf.high
##    <chr>         <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 age           1.02     0.0192     0.846   0.398   0.979       1.06
##  2 BMI           0.968    0.0370    -0.865   0.387   0.901       1.04
##  3 clusters1     3.42     1.12       1.10    0.273   0.379      30.8 
##  4 clusters2     2.16     1.16       0.664   0.506   0.223      20.9 
##  5 clusters3     1.28     1.42       0.173   0.862   0.0798     20.5 
##  6 clusters4     0.663    1.42      -0.291   0.771   0.0414     10.6 
##  7 clusters5     1.98     1.10       0.625   0.532   0.232      17.0 
##  8 clusters6     0.570    1.42      -0.396   0.692   0.0354      9.18
##  9 clusters7     2.51     1.12       0.823   0.411   0.280      22.6 
## 10 clusters9     2.38     1.23       0.708   0.479   0.216      26.3 
## 11 clusters10    2.80     1.15       0.890   0.373   0.291      26.9
options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_10_surv[[1]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1,6) +
    labs(x = "Hazard ratio")
## Warning: Removed 9 rows containing missing values (geom_segment).
## Warning: Removed 4 rows containing missing values (geom_point).

options(repr.plot.width = 9, repr.plot.height = 3)
bbdd_clusters_10_surv[[2]] %>%
    ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 1, lty = "dashed") +
    geom_linerange(aes(xmin = conf.low, xmax = conf.high)) +
    geom_point(aes(color = term)) +
    theme_bw() +
    theme(legend.position = "none") +
    xlim(-1, 6) +
    labs(x = "Hazard ratio")
## Warning: Removed 9 rows containing missing values (geom_segment).

DISTRIBUCIÓN BMI Y WEIGHT EN CLUSTERS

6 CLUSTERS

bbdd_clusters_6 %>% group_by(sex_female, .add=TRUE) %>% group_split() %>% map(function(dat){
  boxplot(dat$BMI~dat$clusters,col=viridis(8), main=paste("BMI DISTRIBUTION IN", if_else(dat$sex_female[1]=="Men", "MALE", "FEMALE")),
          xlab="CLUSTER", ylab="BMI")
})

## [[1]]
## [[1]]$stats
##       [,1]  [,2]   [,3]   [,4]  [,5]   [,6]
## [1,] 19.02 20.32 19.270 20.440 20.55 22.130
## [2,] 26.81 25.93 26.500 27.115 26.40 27.380
## [3,] 29.77 28.74 28.795 29.550 28.45 29.620
## [4,] 33.25 31.43 31.400 32.715 31.91 32.915
## [5,] 41.80 38.40 38.600 40.520 40.16 38.970
## 
## [[1]]$n
## [1] 158 101 282 123 166  67
## 
## [[1]]$conf
##         [,1]     [,2]     [,3]    [,4]    [,5]     [,6]
## [1,] 28.9605 27.87531 28.33397 28.7522 27.7743 28.55159
## [2,] 30.5795 29.60469 29.25603 30.3478 29.1257 30.68841
## 
## [[1]]$out
##  [1] 43.25 40.64 40.04 41.08 44.69 49.96 39.68 42.31 40.12 42.72 53.52 41.27
## [13] 16.31 42.61 42.07 41.87 51.53 45.88 49.45 41.74
## 
## [[1]]$group
##  [1] 1 2 2 3 3 3 3 3 3 4 4 4 4 5 5 5 5 6 6 6
## 
## [[1]]$names
## [1] "1" "2" "3" "4" "5" "6"
## 
## 
## [[2]]
## [[2]]$stats
##        [,1]   [,2]   [,3]  [,4]   [,5]   [,6]
## [1,] 19.740 19.560 19.010 16.75 19.110 18.270
## [2,] 27.225 27.430 26.910 27.09 27.080 27.590
## [3,] 30.935 31.385 30.900 30.80 30.330 32.060
## [4,] 34.010 35.430 34.855 35.54 35.515 36.345
## [5,] 43.420 47.070 45.920 47.87 47.250 49.050
## 
## [[2]]$n
## [1] 152 178 156 367 300 164
## 
## [[2]]$conf
##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]
## [1,] 30.06547 30.43759 29.89495 30.10308 29.56055 30.97983
## [2,] 31.80453 32.33241 31.90505 31.49692 31.09945 33.14017
## 
## [[2]]$out
##  [1] 56.14 14.00 51.86 44.38 51.02 49.10 44.60 49.34 49.95 51.93 52.34 51.47
## [13] 49.49 50.47 49.73 50.94 53.95 48.91 52.27 59.14 54.65 51.86
## 
## [[2]]$group
##  [1] 1 1 1 1 1 1 1 2 2 3 3 4 4 4 4 4 4 4 5 5 6 6
## 
## [[2]]$names
## [1] "1" "2" "3" "4" "5" "6"
bbdd_clusters_6 %>% group_by(sex_female, .add=TRUE) %>% group_split() %>% map2(bbdd_covar %>% group_by(sex_female) %>% group_split,
                                                                               function(dat, weight){
  dat <- dat %>% left_join(weight %>% select(weight, rowId), by="rowId")
  boxplot(dat$weight~dat$clusters,col=viridis(8), main=paste("WEIGTH DISTRIBUTION IN", if_else(dat$sex_female[1]=="Men", "MALE", "FEMALE")),
          xlab="CLUSTER", ylab="BMI")
})

## [[1]]
## [[1]]$stats
##      [,1]  [,2]  [,3]  [,4]  [,5]   [,6]
## [1,]   57  52.0  51.2  44.4  52.0  61.50
## [2,]   75  70.8  72.0  73.9  73.0  74.75
## [3,]   85  80.0  81.0  83.0  80.0  84.90
## [4,]   97  89.0  89.5  94.0  90.9  95.95
## [5,]  125 113.0 115.2 122.1 112.7 116.40
## 
## [[1]]$n
## [1] 158 101 282 123 166  67
## 
## [[1]]$conf
##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]
## [1,] 82.23464 77.13867 79.35347 80.13648 77.80489 80.80781
## [2,] 87.76536 82.86133 82.64653 85.86352 82.19511 88.99219
## 
## [[1]]$out
##  [1] 130.0 140.0 118.2 153.0 118.0 126.3 156.5 125.0 128.0 132.0 121.3 133.3
## [13] 121.0 156.0 123.0 138.9 133.0
## 
## [[1]]$group
##  [1] 2 3 3 3 3 4 4 4 5 5 5 5 5 5 5 6 6
## 
## [[1]]$names
## [1] "1" "2" "3" "4" "5" "6"
## 
## 
## [[2]]
## [[2]]$stats
##        [,1]  [,2]   [,3]   [,4]   [,5]   [,6]
## [1,]  41.50  44.6  44.50  38.30  38.50  35.80
## [2,]  64.00  64.2  63.10  63.85  64.00  65.35
## [3,]  74.00  71.5  72.75  72.80  71.50  76.80
## [4,]  82.75  87.5  85.00  84.25  84.05  87.35
## [5,] 109.00 122.0 112.50 113.00 114.00 116.90
## 
## [[2]]$n
## [1] 152 178 156 367 300 164
## 
## [[2]]$conf
##         [,1]     [,2]     [,3]    [,4]     [,5]    [,6]
## [1,] 71.5971 68.74067 69.97962 71.1175 69.67101 74.0857
## [2,] 76.4029 74.25933 75.52038 74.4825 73.32899 79.5143
## 
## [[2]]$out
##  [1] 128.0  31.5 123.0 130.6 126.3 128.0 134.0 126.5 128.5 122.0 126.0 115.0
## [13] 156.0 124.3 115.0 114.5 121.5 121.0 119.0 161.0 121.0 128.6 150.6 122.0
## 
## [[2]]$group
##  [1] 1 1 1 1 2 3 3 4 4 4 4 4 4 4 4 5 5 5 5 5 5 6 6 6
## 
## [[2]]$names
## [1] "1" "2" "3" "4" "5" "6"

7 CLUSTERS

bbdd_clusters_7 %>% group_by(sex_female, .add=TRUE) %>% group_split() %>% map(function(dat){
  boxplot(dat$BMI~dat$clusters,col=viridis(8), main=paste("BMI DISTRIBUTION IN", if_else(dat$sex_female[1]=="Men", "MALE", "FEMALE")),
          xlab="CLUSTER", ylab="BMI")
})

## [[1]]
## [[1]]$stats
##        [,1]  [,2]   [,3]  [,4]  [,5]  [,6]   [,7]
## [1,] 20.550 19.27 20.060 20.44 19.02 22.13 21.160
## [2,] 25.970 26.34 26.765 27.09 26.81 27.28 25.930
## [3,] 27.895 28.75 29.380 29.58 29.52 29.62 28.385
## [4,] 30.860 31.91 31.595 33.13 32.60 32.35 32.100
## [5,] 37.550 40.14 38.600 41.27 40.37 38.97 40.120
## 
## [[1]]$n
## [1]  98  90 247  93 162  69 138
## 
## [[1]]$conf
##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]
## [1,] 27.11454 27.82234 28.89443 28.59042 28.80125 28.65564 27.55514
## [2,] 28.67546 29.67766 29.86557 30.56958 30.23875 30.58436 29.21486
## 
## [[1]]$out
##  [1] 42.07 41.87 38.63 51.53 40.16 40.64 42.61 41.08 44.69 49.96 39.68 42.72
## [13] 53.52 16.31 43.25 41.80 45.88 49.45 41.74 42.31
## 
## [[1]]$group
##  [1] 1 1 1 1 1 2 3 3 3 3 3 4 4 4 5 5 6 6 6 7
## 
## [[1]]$names
## [1] "1" "2" "3" "4" "5" "6" "7"
## 
## 
## [[2]]
## [[2]]$stats
##        [,1]  [,2]   [,3]   [,4]   [,5]   [,6]   [,7]
## [1,] 19.720 19.56 19.110 17.970 16.750 18.270 19.010
## [2,] 26.705 27.41 28.055 26.860 27.595 27.685 27.040
## [3,] 30.850 31.26 31.400 30.235 30.800 31.945 31.105
## [4,] 34.705 34.96 36.235 34.550 36.255 35.840 34.720
## [5,] 45.920 46.09 44.600 45.610 47.250 45.700 44.220
## 
## [[2]]$n
## [1] 243 187 103 262 284 144  94
## 
## [[2]]$conf
##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]
## [1,] 30.03914 30.38767 30.12652 29.48436 29.98808 30.87126 29.85343
## [2,] 31.66086 32.13233 32.67348 30.98564 31.61192 33.01874 32.35657
## 
## [[2]]$out
##  [1] 51.47 14.00 51.86 51.93 50.94 52.34 47.36 51.02 49.10 48.91 49.34 56.14
## [13] 47.07 49.95 46.66 46.66 49.73 47.87 53.95 52.27 49.49 50.47 59.14 54.65
## [25] 49.05 51.86 48.87 46.80
## 
## [[2]]$group
##  [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 4 4 4 4 5 5 5 5 6 6 6 6 7
## 
## [[2]]$names
## [1] "1" "2" "3" "4" "5" "6" "7"
bbdd_clusters_7 %>% group_by(sex_female, .add=TRUE) %>% group_split() %>% map2(bbdd_covar %>% group_by(sex_female) %>% group_split,
                                                                               function(dat, weight){
  dat <- dat %>% left_join(weight %>% select(weight, rowId), by="rowId")
  boxplot(dat$weight~dat$clusters,col=viridis(8), main=paste("WEIGTH DISTRIBUTION IN", if_else(dat$sex_female[1]=="Men", "MALE", "FEMALE")),
          xlab="CLUSTER", ylab="BMI")
})

## [[1]]
## [[1]]$stats
##        [,1]  [,2]   [,3]  [,4]  [,5]  [,6]  [,7]
## [1,]  52.00  51.2  52.00  44.4  61.1  59.2  54.0
## [2,]  70.00  71.7  74.65  75.3  75.0  74.5  70.8
## [3,]  77.45  79.9  82.00  84.1  83.9  84.0  81.0
## [4,]  88.70  91.0  90.00  97.0  94.5  95.5  90.0
## [5,] 112.70 116.0 112.00 126.3 121.3 116.4 118.0
## 
## [[1]]$n
## [1]  98  90 247  93 162  69 138
## 
## [[1]]$conf
##         [,1]     [,2]     [,3]     [,4]     [,5]    [,6]     [,7]
## [1,] 74.4654 76.68565 80.45682 80.54471 81.47934 80.0056 78.41763
## [2,] 80.4346 83.11435 83.54318 87.65529 86.32066 87.9944 83.58237
## 
## [[1]]$out
##  [1] 133.3 121.0 156.0 123.0 130.0 128.0 132.0 113.5 115.0 140.0 118.2 153.0
## [13] 114.0 156.5 125.0 138.9 133.0
## 
## [[1]]$group
##  [1] 1 1 1 1 2 3 3 3 3 3 3 3 3 4 5 6 6
## 
## [[1]]$names
## [1] "1" "2" "3" "4" "5" "6" "7"
## 
## 
## [[2]]
## [[2]]$stats
##       [,1]   [,2]   [,3]   [,4]   [,5]   [,6]  [,7]
## [1,]  31.5  44.60  43.00  38.30  44.50  35.80  38.5
## [2,]  62.7  64.30  67.55  63.00  65.00  65.35  62.0
## [3,]  73.0  71.50  75.00  70.25  74.50  77.25  71.7
## [4,]  84.0  85.75  84.60  81.00  85.85  87.00  83.0
## [5,] 113.0 115.00 108.00 107.80 114.50 116.90 112.2
## 
## [[2]]$n
## [1] 243 187 103 262 284 144  94
## 
## [[2]]$conf
##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]
## [1,] 70.84109 69.02164 72.34562 68.49297 72.54519 74.39942 68.27774
## [2,] 75.15891 73.97836 77.65438 72.00703 76.45481 80.10058 75.12226
## 
## [[2]]$out
##  [1] 128.5 123.0 128.0 156.0 134.0 124.3 130.6 126.3 128.0 122.0 120.0 128.6
## [13]  41.5 126.5 109.9 113.3 115.0 115.0 122.0 121.5 126.0 119.0 161.0 121.0
## [25] 150.6 121.0 122.0
## 
## [[2]]$group
##  [1] 1 1 1 1 1 1 1 2 2 2 2 3 3 4 4 4 4 4 5 5 5 5 5 5 6 6 6
## 
## [[2]]$names
## [1] "1" "2" "3" "4" "5" "6" "7"

8 CLUSTERS

bbdd_clusters_8 %>% group_by(sex_female, .add=TRUE) %>% group_split() %>% map(function(dat){
  boxplot(dat$BMI~dat$clusters,col=viridis(8), main=paste("BMI DISTRIBUTION IN", if_else(dat$sex_female[1]=="Men", "MALE", "FEMALE")),
          xlab="CLUSTER", ylab="BMI")
})

## [[1]]
## [[1]]$stats
##       [,1]   [,2]   [,3]  [,4]  [,5]  [,6]  [,7]   [,8]
## [1,] 21.55 20.320 22.210 20.44 20.55 22.13 19.27 19.020
## [2,] 26.09 26.480 27.435 26.70 26.42 27.38 25.91 26.725
## [3,] 28.25 29.380 29.640 28.68 28.64 30.09 28.30 29.840
## [4,] 30.99 32.215 32.035 32.45 32.65 32.57 31.48 32.500
## [5,] 36.85 40.640 38.600 39.93 40.16 38.97 39.68 40.370
## 
## [[1]]$n
## [1]  65  75 151  82 106  63 207 148
## 
## [[1]]$conf
##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]     [,8]
## [1,] 27.28972 28.33369 29.04854 27.67673 27.68392 29.05687 27.68832 29.08997
## [2,] 29.21028 30.42631 30.23146 29.68327 29.59608 31.12313 28.91168 30.59003
## 
## [[1]]$out
##  [1] 41.87 40.52 41.08 40.15 39.12 41.27 40.14 42.72 53.52 42.61 42.07 51.53
## [13] 45.88 49.45 41.74 44.69 49.96 42.31 40.12 16.31 43.25 41.80
## 
## [[1]]$group
##  [1] 1 1 3 3 3 3 3 4 4 5 5 5 6 6 6 7 7 7 7 7 8 8
## 
## [[1]]$names
## [1] "1" "2" "3" "4" "5" "6" "7" "8"
## 
## 
## [[2]]
## [[2]]$stats
##       [,1]   [,2]  [,3]   [,4]  [,5]  [,6]  [,7]  [,8]
## [1,] 14.00 19.560 19.74 17.970 16.75 20.45 18.27 19.02
## [2,] 26.62 27.890 28.20 26.745 27.56 28.20 26.83 27.61
## [3,] 30.83 31.495 31.43 30.240 30.70 32.05 31.11 31.84
## [4,] 35.49 35.210 34.19 35.050 35.96 35.56 34.30 36.21
## [5,] 47.36 44.850 42.06 46.660 47.25 44.36 45.06 48.87
## 
## [[2]]$n
## [1] 261 162  93 184 257 113 134 113
## 
## [[2]]$conf
##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]     [,8]
## [1,] 29.96252 30.58632 30.44861 29.27264 29.87212 30.95605 30.09041 30.56175
## [2,] 31.69748 32.40368 32.41139 31.20736 31.52788 33.14395 32.12959 33.11825
## 
## [[2]]$out
##  [1] 51.47 51.86 51.93 50.94 52.34 59.14 49.10 48.91 49.34 56.14 46.80 49.95
## [13] 43.47 44.38 19.11 44.60 47.87 53.95 52.27 49.49 50.47 49.73 51.02 46.73
## [25] 49.05 45.92 54.65 51.86
## 
## [[2]]$group
##  [1] 1 1 1 1 1 1 1 1 2 2 2 2 3 3 3 3 4 4 5 5 5 5 5 6 7 7 8 8
## 
## [[2]]$names
## [1] "1" "2" "3" "4" "5" "6" "7" "8"
bbdd_clusters_8 %>% group_by(sex_female) %>% group_split() %>% map(function(dat){
group_by(dat, clusters) %>%
  summarise(
    count = n(),
    mean = mean(BMI, na.rm = TRUE),
    sd = sd(BMI, na.rm = TRUE)
  )})
## [[1]]
## # A tibble: 8 x 4
##   clusters count  mean    sd
##   <fct>    <int> <dbl> <dbl>
## 1 1           65  28.7  3.98
## 2 2           75  29.4  4.21
## 3 3          151  29.8  3.90
## 4 4           82  30.1  5.29
## 5 5          106  29.7  5.06
## 6 6           63  30.7  5.00
## 7 7          207  28.9  4.67
## 8 8          148  30.0  4.32
## 
## [[2]]
## # A tibble: 8 x 4
##   clusters count  mean    sd
##   <fct>    <int> <dbl> <dbl>
## 1 1          261  31.6  6.75
## 2 2          162  32.0  6.07
## 3 3           93  31.7  5.49
## 4 4          184  31.3  6.23
## 5 5          257  31.9  6.17
## 6 6          113  31.9  5.52
## 7 7          134  31.1  5.84
## 8 8          113  32.5  6.60
bbdd_clusters_8 %>% group_by(sex_female, .add=TRUE) %>% group_split() %>% map(function(dat){
  res.aov <- aov(BMI ~ clusters, data = dat)
  print(summary(res.aov))
  print(TukeyHSD(res.aov))
  })
##              Df Sum Sq Mean Sq F value Pr(>F)  
## clusters      7    270   38.62   1.872  0.071 .
## Residuals   889  18337   20.63                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = BMI ~ clusters, data = dat)
## 
## $clusters
##            diff        lwr       upr     p adj
## 2-1  0.68571282 -1.6525874 3.0240131 0.9868721
## 3-1  1.12944880 -0.9174920 3.1763896 0.7026795
## 4-1  1.36813884 -0.9233547 3.6596323 0.6107179
## 5-1  1.02163861 -1.1521235 3.1954007 0.8442867
## 6-1  1.95063980 -0.4888646 4.3901442 0.2283958
## 7-1  0.21631958 -1.7455321 2.1781712 0.9999773
## 8-1  1.27234615 -0.7808282 3.3255205 0.5632104
## 3-2  0.44373598 -1.5054742 2.3929462 0.9972157
## 4-2  0.68242602 -1.5222046 2.8870566 0.9820009
## 5-2  0.33592579 -1.7460673 2.4179189 0.9997033
## 6-2  1.26492698 -1.0931729 3.6230269 0.7321970
## 7-2 -0.46939324 -2.3290480 1.3902615 0.9946933
## 8-2  0.58663333 -1.3691219 2.5423886 0.9850100
## 4-3  0.23869003 -1.6541161 2.1314961 0.9999434
## 5-3 -0.10781020 -1.8562421 1.6406217 0.9999996
## 6-3  0.82119100 -1.2483389 2.8907209 0.9303552
## 7-3 -0.91312922 -2.3898264 0.5635679 0.5660263
## 8-3  0.14289735 -1.4531282 1.7389229 0.9999946
## 5-4 -0.34650023 -2.3757833 1.6827828 0.9995680
## 6-4  0.58250097 -1.7291931 2.8941951 0.9947486
## 7-4 -1.15181925 -2.9522664 0.6486279 0.5210666
## 8-4 -0.09579268 -1.9953382 1.8037528 0.9999999
## 6-5  0.92900120 -1.2660452 3.1240476 0.9041321
## 7-5 -0.80531902 -2.4533206 0.8426826 0.8157250
## 8-5  0.25070755 -1.5050180 2.0064331 0.9998690
## 7-6 -1.73432022 -3.7197293 0.2510888 0.1384583
## 8-6 -0.67829365 -2.7539892 1.3974019 0.9754691
## 8-7  1.05602657 -0.4292992 2.5413523 0.3772698
## 
##               Df Sum Sq Mean Sq F value Pr(>F)
## clusters       7    191   27.31   0.711  0.663
## Residuals   1309  50259   38.39               
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = BMI ~ clusters, data = dat)
## 
## $clusters
##            diff       lwr      upr     p adj
## 2-1  0.47494679 -1.406553 2.356446 0.9947271
## 3-1  0.08974169 -2.181962 2.361445 1.0000000
## 4-1 -0.28961373 -2.100381 1.521153 0.9997223
## 5-1  0.35824217 -1.294822 2.011307 0.9979781
## 6-1  0.29871495 -1.819588 2.417018 0.9998805
## 7-1 -0.47246983 -2.471584 1.526644 0.9965002
## 8-1  0.91836097 -1.199942 3.036664 0.8928930
## 3-2 -0.38520510 -2.832480 2.062070 0.9997504
## 4-2 -0.76456052 -2.791232 1.262111 0.9466117
## 5-2 -0.11670462 -2.003804 1.770394 0.9999996
## 6-2 -0.17623184 -2.481818 2.129354 0.9999982
## 7-2 -0.94741662 -3.144000 1.249167 0.8954820
## 8-2  0.44341418 -1.862172 2.749000 0.9990640
## 4-3 -0.37935542 -2.772678 2.013967 0.9997385
## 5-3  0.26850048 -2.007843 2.544844 0.9999644
## 6-3  0.20897326 -2.424717 2.842663 0.9999977
## 7-3 -0.56221152 -3.101025 1.976602 0.9976798
## 8-3  0.82861928 -1.805071 3.462309 0.9803526
## 5-4  0.64785590 -1.168728 2.464440 0.9604330
## 6-4  0.58832868 -1.659907 2.836564 0.9934216
## 7-4 -0.18285610 -2.319165 1.953452 0.9999961
## 8-4  1.20797470 -1.040261 3.456210 0.7313511
## 6-5 -0.05952722 -2.182805 2.063751 1.0000000
## 7-5 -0.83071200 -2.835097 1.173673 0.9138581
## 8-5  0.56011880 -1.563159 2.683397 0.9930902
## 7-6 -0.77118478 -3.173714 1.631344 0.9779674
## 8-6  0.61964602 -1.882931 3.122223 0.9953209
## 8-7  1.39083080 -1.011698 3.793360 0.6491455
## [[1]]
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = BMI ~ clusters, data = dat)
## 
## $clusters
##            diff        lwr       upr     p adj
## 2-1  0.68571282 -1.6525874 3.0240131 0.9868721
## 3-1  1.12944880 -0.9174920 3.1763896 0.7026795
## 4-1  1.36813884 -0.9233547 3.6596323 0.6107179
## 5-1  1.02163861 -1.1521235 3.1954007 0.8442867
## 6-1  1.95063980 -0.4888646 4.3901442 0.2283958
## 7-1  0.21631958 -1.7455321 2.1781712 0.9999773
## 8-1  1.27234615 -0.7808282 3.3255205 0.5632104
## 3-2  0.44373598 -1.5054742 2.3929462 0.9972157
## 4-2  0.68242602 -1.5222046 2.8870566 0.9820009
## 5-2  0.33592579 -1.7460673 2.4179189 0.9997033
## 6-2  1.26492698 -1.0931729 3.6230269 0.7321970
## 7-2 -0.46939324 -2.3290480 1.3902615 0.9946933
## 8-2  0.58663333 -1.3691219 2.5423886 0.9850100
## 4-3  0.23869003 -1.6541161 2.1314961 0.9999434
## 5-3 -0.10781020 -1.8562421 1.6406217 0.9999996
## 6-3  0.82119100 -1.2483389 2.8907209 0.9303552
## 7-3 -0.91312922 -2.3898264 0.5635679 0.5660263
## 8-3  0.14289735 -1.4531282 1.7389229 0.9999946
## 5-4 -0.34650023 -2.3757833 1.6827828 0.9995680
## 6-4  0.58250097 -1.7291931 2.8941951 0.9947486
## 7-4 -1.15181925 -2.9522664 0.6486279 0.5210666
## 8-4 -0.09579268 -1.9953382 1.8037528 0.9999999
## 6-5  0.92900120 -1.2660452 3.1240476 0.9041321
## 7-5 -0.80531902 -2.4533206 0.8426826 0.8157250
## 8-5  0.25070755 -1.5050180 2.0064331 0.9998690
## 7-6 -1.73432022 -3.7197293 0.2510888 0.1384583
## 8-6 -0.67829365 -2.7539892 1.3974019 0.9754691
## 8-7  1.05602657 -0.4292992 2.5413523 0.3772698
## 
## 
## [[2]]
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = BMI ~ clusters, data = dat)
## 
## $clusters
##            diff       lwr      upr     p adj
## 2-1  0.47494679 -1.406553 2.356446 0.9947271
## 3-1  0.08974169 -2.181962 2.361445 1.0000000
## 4-1 -0.28961373 -2.100381 1.521153 0.9997223
## 5-1  0.35824217 -1.294822 2.011307 0.9979781
## 6-1  0.29871495 -1.819588 2.417018 0.9998805
## 7-1 -0.47246983 -2.471584 1.526644 0.9965002
## 8-1  0.91836097 -1.199942 3.036664 0.8928930
## 3-2 -0.38520510 -2.832480 2.062070 0.9997504
## 4-2 -0.76456052 -2.791232 1.262111 0.9466117
## 5-2 -0.11670462 -2.003804 1.770394 0.9999996
## 6-2 -0.17623184 -2.481818 2.129354 0.9999982
## 7-2 -0.94741662 -3.144000 1.249167 0.8954820
## 8-2  0.44341418 -1.862172 2.749000 0.9990640
## 4-3 -0.37935542 -2.772678 2.013967 0.9997385
## 5-3  0.26850048 -2.007843 2.544844 0.9999644
## 6-3  0.20897326 -2.424717 2.842663 0.9999977
## 7-3 -0.56221152 -3.101025 1.976602 0.9976798
## 8-3  0.82861928 -1.805071 3.462309 0.9803526
## 5-4  0.64785590 -1.168728 2.464440 0.9604330
## 6-4  0.58832868 -1.659907 2.836564 0.9934216
## 7-4 -0.18285610 -2.319165 1.953452 0.9999961
## 8-4  1.20797470 -1.040261 3.456210 0.7313511
## 6-5 -0.05952722 -2.182805 2.063751 1.0000000
## 7-5 -0.83071200 -2.835097 1.173673 0.9138581
## 8-5  0.56011880 -1.563159 2.683397 0.9930902
## 7-6 -0.77118478 -3.173714 1.631344 0.9779674
## 8-6  0.61964602 -1.882931 3.122223 0.9953209
## 8-7  1.39083080 -1.011698 3.793360 0.6491455
bbdd_clusters_8 %>% group_by(sex_female, .add=TRUE) %>% group_split() %>% map2(bbdd_covar %>% group_by(sex_female) %>% group_split,
                                                                               function(dat, weight){
  dat <- dat %>% left_join(weight %>% select(weight, rowId), by="rowId")
  boxplot(dat$weight~dat$clusters,col=viridis(8), main=paste("WEIGTH DISTRIBUTION IN", if_else(dat$sex_female[1]=="Men", "MALE", "FEMALE")),
          xlab="CLUSTER", ylab="BMI")
})

## [[1]]
## [[1]]$stats
##       [,1]   [,2]   [,3]   [,4]   [,5]   [,6]   [,7]  [,8]
## [1,]  55.5  52.00  58.00  56.50  58.00  61.50  44.40  57.0
## [2,]  70.5  72.75  75.55  73.00  73.50  74.75  70.00  75.5
## [3,]  78.8  80.50  83.50  82.85  80.15  84.00  79.00  83.9
## [4,]  88.7  91.10  90.45  91.80  93.50  95.95  88.75  94.0
## [5,] 113.0 113.00 110.50 119.50 123.00 116.40 115.20 121.3
## 
## [[1]]$n
## [1]  65  75 151  82 106  63 207 148
## 
## [[1]]$conf
##          [,1]     [,2]     [,3]     [,4]     [,5]    [,6]     [,7]     [,8]
## [1,] 75.23326 77.15218 81.58418 79.56974 77.08074 79.7799 76.94092 81.49731
## [2,] 82.36674 83.84782 85.41582 86.13026 83.21926 88.2201 81.05908 86.30269
## 
## [[1]]$out
##  [1] 121.0 130.0 121.0 118.2  52.0 114.4 122.1 116.0 125.0 114.0 126.3 156.5
## [13] 128.0 132.0 133.3 156.0 138.9 133.0 140.0 153.0 118.0 125.0
## 
## [[1]]$group
##  [1] 1 2 3 3 3 3 3 3 3 3 4 4 5 5 5 5 6 6 7 7 7 8
## 
## [[1]]$names
## [1] "1" "2" "3" "4" "5" "6" "7" "8"
## 
## 
## [[2]]
## [[2]]$stats
##       [,1]  [,2]  [,3]   [,4]  [,5]  [,6]  [,7]  [,8]
## [1,]  31.5  44.6  41.5  38.30  44.5  38.5  35.8  45.1
## [2,]  63.0  64.2  65.0  62.75  65.0  65.5  62.8  66.5
## [3,]  71.8  73.5  75.0  71.00  73.5  74.0  70.0  77.0
## [4,]  84.0  87.5  85.0  83.00  86.0  83.6  82.4  87.0
## [5,] 115.0 120.0 108.0 109.90 114.5 102.5 107.5 116.9
## 
## [[2]]$n
## [1] 261 162  93 184 257 113 134 113
## 
## [[2]]$conf
##          [,1]     [,2]     [,3]    [,4]     [,5]     [,6]     [,7]     [,8]
## [1,] 69.74621 70.60762 71.72323 68.6413 71.43029 71.30973 67.32477 73.95301
## [2,] 73.85379 76.39238 78.27677 73.3587 75.56971 76.69027 72.67523 80.04699
## 
## [[2]]$out
##  [1] 128.5 123.0 122.0 128.0 156.0 134.0 124.3 161.0 126.3 128.0 128.6 126.5
## [13] 115.0 115.0 122.0 121.5 126.0 119.0 130.6 121.0 112.2 111.0 114.0 121.0
## [25] 112.5 150.6 122.0
## 
## [[2]]$group
##  [1] 1 1 1 1 1 1 1 1 2 2 3 4 4 4 5 5 5 5 5 5 6 6 7 7 7 8 8
## 
## [[2]]$names
## [1] "1" "2" "3" "4" "5" "6" "7" "8"
bbdd_clusters_8 %>% group_by(sex_female, .add=TRUE) %>% group_split() %>% map2(bbdd_covar %>% group_by(sex_female) %>% group_split,
                                                                               function(dat, weight){
group_by(dat, clusters) %>% left_join(weight %>% select(weight, rowId), by="rowId") %>% 
  summarise(
    count = n(),
    mean = mean(weight, na.rm = TRUE),
    sd = sd(BMI, na.rm = TRUE)
  )})
## [[1]]
## # A tibble: 8 x 4
##   clusters count  mean    sd
##   <fct>    <int> <dbl> <dbl>
## 1 1           65  80.5  3.98
## 2 2           75  82.6  4.21
## 3 3          151  84.4  3.90
## 4 4           82  84.1  5.29
## 5 5          106  84.4  5.06
## 6 6           63  87.3  5.00
## 7 7          207  81.1  4.67
## 8 8          148  85.4  4.32
## 
## [[2]]
## # A tibble: 8 x 4
##   clusters count  mean    sd
##   <fct>    <int> <dbl> <dbl>
## 1 1          261  75.3  6.75
## 2 2          162  76.1  6.07
## 3 3           93  75.9  5.49
## 4 4          184  73.7  6.23
## 5 5          257  76.3  6.17
## 6 6          113  75.1  5.52
## 7 7          134  73.2  5.84
## 8 8          113  77.2  6.60
bbdd_clusters_8 %>% group_by(sex_female, .add=TRUE) %>% group_split() %>% map2(bbdd_covar %>% group_by(sex_female) %>% group_split,
                                                                               function(dat, weight){
dat <- dat %>% left_join(weight %>% select(weight, rowId), by="rowId")
  res.aov <- aov(weight~ clusters, data = dat)
  print(summary(res.aov))
  print(TukeyHSD(res.aov))
                                                                               })
##              Df Sum Sq Mean Sq F value Pr(>F)  
## clusters      7   3502   500.3   2.218 0.0308 *
## Residuals   889 200505   225.5                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = weight ~ clusters, data = dat)
## 
## $clusters
##           diff         lwr        upr     p adj
## 2-1  2.1591282  -5.5729629  9.8912193 0.9901872
## 3-1  3.8687927  -2.8998561 10.6374414 0.6631978
## 4-1  3.6648030  -3.9125116 11.2421176 0.8236326
## 5-1  3.8863861  -3.3016244 11.0743965 0.7241214
## 6-1  6.8263980  -1.2403462 14.8931423 0.1679714
## 7-1  0.6174954  -5.8697878  7.1047785 0.9999917
## 8-1  4.9035967  -1.8856646 11.6928579 0.3560847
## 3-2  1.7096645  -4.7358171  8.1551460 0.9928068
## 4-2  1.5056748  -5.7844091  8.7957586 0.9985036
## 5-2  1.7272579  -5.1572988  8.6118145 0.9948876
## 6-2  4.6672698  -3.1302932 12.4648328 0.6075796
## 7-2 -1.5416329  -7.6909799  4.6077142 0.9949115
## 8-2  2.7444685  -3.7226557  9.2115926 0.9028709
## 4-3 -0.2039897  -6.4629589  6.0549796 1.0000000
## 5-3  0.0175934  -5.7639716  5.7991584 1.0000000
## 6-3  2.9576054  -3.8857390  9.8009497 0.8939945
## 7-3 -3.2512973  -8.1343130  1.6317184 0.4668432
## 8-3  1.0348040  -4.2427966  6.3124046 0.9989307
## 5-4  0.2215831  -6.4886765  6.9318427 1.0000000
## 6-4  3.1615950  -4.4825172 10.8057073 0.9143170
## 7-4 -3.0473076  -9.0008723  2.9062571 0.7769133
## 8-4  1.2387937  -5.0424608  7.5200482 0.9988895
## 6-5  2.9400120  -4.3183797 10.1984037 0.9227518
## 7-5 -3.2688907  -8.7183614  2.1805799 0.6048645
## 8-5  1.0172106  -4.7884725  6.8228937 0.9994879
## 7-6 -6.2089027 -12.7740835  0.3562781 0.0793080
## 8-6 -1.9228014  -8.7865339  4.9409311 0.9899968
## 8-7  4.2861013  -0.6254468  9.1976494 0.1393397
## 
##               Df Sum Sq Mean Sq F value Pr(>F)
## clusters       7   1942   277.4   1.061  0.386
## Residuals   1309 342238   261.4               
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = weight ~ clusters, data = dat)
## 
## $clusters
##           diff       lwr       upr     p adj
## 2-1  0.7464453 -4.163330  5.656220 0.9998020
## 3-1  0.5971821 -5.330830  6.525194 0.9999879
## 4-1 -1.6540948 -6.379293  3.071103 0.9642441
## 5-1  1.0207836 -3.292890  5.334457 0.9964723
## 6-1 -0.2377785 -5.765492  5.289935 1.0000000
## 7-1 -2.1461657 -7.362856  3.070525 0.9169357
## 8-1  1.9392127 -3.588500  7.466926 0.9638188
## 3-2 -0.1492632 -6.535430  6.236904 1.0000000
## 4-2 -2.4005401 -7.689141  2.888061 0.8673442
## 5-2  0.2743383 -4.650048  5.198725 0.9999998
## 6-2 -0.9842238 -7.000653  5.032205 0.9996773
## 7-2 -2.8926110 -8.624597  2.839375 0.7900780
## 8-2  1.1927674 -4.823662  7.209196 0.9988599
## 4-3 -2.2512769 -8.496654  3.994100 0.9580793
## 5-3  0.4236015 -5.516518  6.363721 0.9999989
## 6-3 -0.8349605 -7.707577  6.037656 0.9999564
## 7-3 -2.7433478 -9.368384  3.881688 0.9142252
## 8-3  1.3420306 -5.530585  8.214647 0.9989666
## 5-4  2.6748784 -2.065500  7.415257 0.6787825
## 6-4  1.4163164 -4.450456  7.283089 0.9960028
## 7-4 -0.4920709 -6.066770  5.082628 0.9999951
## 8-4  3.5933075 -2.273465  9.460080 0.5792778
## 6-5 -1.2585620 -6.799257  4.282133 0.9972739
## 7-5 -3.1669493 -8.397394  2.063495 0.5939767
## 8-5  0.9184291 -4.622266  6.459124 0.9996478
## 7-6 -1.9083873 -8.177788  4.361013 0.9837695
## 8-6  2.1769912 -4.353487  8.707469 0.9727208
## 8-7  4.0853784 -2.184022 10.354779 0.4970667
## [[1]]
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = weight ~ clusters, data = dat)
## 
## $clusters
##           diff         lwr        upr     p adj
## 2-1  2.1591282  -5.5729629  9.8912193 0.9901872
## 3-1  3.8687927  -2.8998561 10.6374414 0.6631978
## 4-1  3.6648030  -3.9125116 11.2421176 0.8236326
## 5-1  3.8863861  -3.3016244 11.0743965 0.7241214
## 6-1  6.8263980  -1.2403462 14.8931423 0.1679714
## 7-1  0.6174954  -5.8697878  7.1047785 0.9999917
## 8-1  4.9035967  -1.8856646 11.6928579 0.3560847
## 3-2  1.7096645  -4.7358171  8.1551460 0.9928068
## 4-2  1.5056748  -5.7844091  8.7957586 0.9985036
## 5-2  1.7272579  -5.1572988  8.6118145 0.9948876
## 6-2  4.6672698  -3.1302932 12.4648328 0.6075796
## 7-2 -1.5416329  -7.6909799  4.6077142 0.9949115
## 8-2  2.7444685  -3.7226557  9.2115926 0.9028709
## 4-3 -0.2039897  -6.4629589  6.0549796 1.0000000
## 5-3  0.0175934  -5.7639716  5.7991584 1.0000000
## 6-3  2.9576054  -3.8857390  9.8009497 0.8939945
## 7-3 -3.2512973  -8.1343130  1.6317184 0.4668432
## 8-3  1.0348040  -4.2427966  6.3124046 0.9989307
## 5-4  0.2215831  -6.4886765  6.9318427 1.0000000
## 6-4  3.1615950  -4.4825172 10.8057073 0.9143170
## 7-4 -3.0473076  -9.0008723  2.9062571 0.7769133
## 8-4  1.2387937  -5.0424608  7.5200482 0.9988895
## 6-5  2.9400120  -4.3183797 10.1984037 0.9227518
## 7-5 -3.2688907  -8.7183614  2.1805799 0.6048645
## 8-5  1.0172106  -4.7884725  6.8228937 0.9994879
## 7-6 -6.2089027 -12.7740835  0.3562781 0.0793080
## 8-6 -1.9228014  -8.7865339  4.9409311 0.9899968
## 8-7  4.2861013  -0.6254468  9.1976494 0.1393397
## 
## 
## [[2]]
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = weight ~ clusters, data = dat)
## 
## $clusters
##           diff       lwr       upr     p adj
## 2-1  0.7464453 -4.163330  5.656220 0.9998020
## 3-1  0.5971821 -5.330830  6.525194 0.9999879
## 4-1 -1.6540948 -6.379293  3.071103 0.9642441
## 5-1  1.0207836 -3.292890  5.334457 0.9964723
## 6-1 -0.2377785 -5.765492  5.289935 1.0000000
## 7-1 -2.1461657 -7.362856  3.070525 0.9169357
## 8-1  1.9392127 -3.588500  7.466926 0.9638188
## 3-2 -0.1492632 -6.535430  6.236904 1.0000000
## 4-2 -2.4005401 -7.689141  2.888061 0.8673442
## 5-2  0.2743383 -4.650048  5.198725 0.9999998
## 6-2 -0.9842238 -7.000653  5.032205 0.9996773
## 7-2 -2.8926110 -8.624597  2.839375 0.7900780
## 8-2  1.1927674 -4.823662  7.209196 0.9988599
## 4-3 -2.2512769 -8.496654  3.994100 0.9580793
## 5-3  0.4236015 -5.516518  6.363721 0.9999989
## 6-3 -0.8349605 -7.707577  6.037656 0.9999564
## 7-3 -2.7433478 -9.368384  3.881688 0.9142252
## 8-3  1.3420306 -5.530585  8.214647 0.9989666
## 5-4  2.6748784 -2.065500  7.415257 0.6787825
## 6-4  1.4163164 -4.450456  7.283089 0.9960028
## 7-4 -0.4920709 -6.066770  5.082628 0.9999951
## 8-4  3.5933075 -2.273465  9.460080 0.5792778
## 6-5 -1.2585620 -6.799257  4.282133 0.9972739
## 7-5 -3.1669493 -8.397394  2.063495 0.5939767
## 8-5  0.9184291 -4.622266  6.459124 0.9996478
## 7-6 -1.9083873 -8.177788  4.361013 0.9837695
## 8-6  2.1769912 -4.353487  8.707469 0.9727208
## 8-7  4.0853784 -2.184022 10.354779 0.4970667

9 CLUSTERS

bbdd_clusters_9 %>% group_by(sex_female, .add=TRUE) %>% group_split() %>% map(function(dat){
  boxplot(dat$BMI~dat$clusters,col=viridis(8), main=paste("BMI DISTRIBUTION IN", if_else(dat$sex_female[1]=="Men", "MALE", "FEMALE")),
          xlab="CLUSTER", ylab="BMI")
})

## [[1]]
## [[1]]$stats
##        [,1]   [,2]   [,3]   [,4]  [,5]  [,6]   [,7]  [,8]  [,9]
## [1,] 21.550 20.830 21.160 20.320 20.55 22.13 19.270 19.02 20.44
## [2,] 25.880 26.495 26.810 26.590 26.42 26.73 26.145 27.22 26.57
## [3,] 27.970 29.210 29.095 29.445 28.75 30.09 28.680 30.08 28.49
## [4,] 30.485 32.535 31.360 32.560 32.82 32.79 31.535 33.27 31.66
## [5,] 34.810 38.400 37.720 40.520 42.07 41.74 38.600 41.80 38.20
## 
## [[1]]$n
## [1]  75  63 138 122  71  58 167 118  85
## 
## [[1]]$conf
##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]   [,7]     [,8]
## [1,] 27.12985 28.00767 28.48303 28.59101 27.54993 28.83277 28.021 29.20002
## [2,] 28.81015 30.41233 29.70697 30.29899 29.95007 31.34723 29.339 30.95998
##         [,9]
## [1,] 27.6177
## [2,] 29.3623
## 
## [[1]]$out
##  [1] 40.64 37.55 40.04 42.72 38.37 44.69 40.15 39.12 51.53 41.27 40.14 40.16
## [13] 41.87 53.52 16.31 42.61 45.88 49.45 49.96 39.68 42.31 43.25 39.93
## 
## [[1]]$group
##  [1] 1 1 1 2 3 3 3 3 3 3 3 3 4 4 4 5 6 6 7 7 7 8 9
## 
## [[1]]$names
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9"
## 
## 
## [[2]]
## [[2]]$stats
##        [,1]   [,2]   [,3]  [,4]   [,5]  [,6]   [,7]   [,8]  [,9]
## [1,] 19.720 19.560 19.110 17.97 16.750 19.97 19.740 19.020 18.27
## [2,] 26.725 27.085 27.610 26.96 27.490 27.60 27.005 27.700 26.78
## [3,] 30.600 31.075 31.155 30.44 30.630 32.38 31.430 31.915 31.11
## [4,] 34.120 34.740 33.730 34.81 36.415 36.44 36.360 36.110 34.60
## [5,] 44.600 46.090 42.060 44.59 49.490 45.92 49.100 45.700 45.49
## 
## [[2]]$n
## [1] 116 152  86 234 239 105 167 104 114
## 
## [[2]]$conf
##          [,1]     [,2]    [,3]     [,4]     [,5]     [,6]     [,7]     [,8]
## [1,] 29.51516 30.09397 30.1123 29.62919 29.71785 31.01694 30.28622 30.61202
## [2,] 31.68484 32.05603 32.1977 31.25081 31.54215 33.74306 32.57378 33.21798
##          [,9]
## [1,] 29.95279
## [2,] 32.26721
## 
## [[2]]$out
##  [1] 51.93 49.34 56.14 14.00 46.80 49.95 43.47 44.38 51.47 46.66 49.73 47.87
## [13] 50.94 47.36 53.95 48.91 52.27 50.47 51.02 51.86 52.34 59.14 54.65 51.86
## [25] 48.87 49.05
## 
## [[2]]$group
##  [1] 1 2 2 2 2 2 3 3 4 4 4 4 4 4 4 4 5 5 5 7 7 7 8 8 8 9
## 
## [[2]]$names
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9"
bbdd_clusters_9 %>% group_by(sex_female, .add=TRUE) %>% group_split() %>% map2(bbdd_covar %>% group_by(sex_female) %>% group_split,
                                                                               function(dat, weight){
  dat <- dat %>% left_join(weight %>% select(weight, rowId), by="rowId")
  boxplot(dat$weight~dat$clusters,col=viridis(8), main=paste("WEIGTH DISTRIBUTION IN", if_else(dat$sex_female[1]=="Men", "MALE", "FEMALE")),
          xlab="CLUSTER", ylab="BMI")
})

## [[1]]
## [[1]]$stats
##        [,1]  [,2]   [,3]  [,4]  [,5]  [,6]   [,7]  [,8]  [,9]
## [1,]  59.10  59.1  52.00  44.4  52.0  61.5  51.20  57.0  55.5
## [2,]  71.25  74.0  73.70  74.0  74.3  73.0  71.70  75.5  71.1
## [3,]  78.50  81.2  82.85  82.7  84.0  84.0  79.00  85.3  79.9
## [4,]  84.95  92.5  90.00  94.0  95.0  95.5  89.25  97.0  88.0
## [5,] 102.00 113.0 114.40 121.0 110.5 116.4 115.20 125.0 110.0
## 
## [[1]]$n
## [1]  75  63 138 122  71  58 167 118  85
## 
## [[1]]$conf
##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]     [,8]
## [1,] 76.00054 77.51737 80.65767 79.83907 80.11851 79.33206 76.85427 82.17281
## [2,] 80.99946 84.88263 85.04233 85.56093 87.88149 88.66794 81.14573 88.42719
##          [,9]
## [1,] 77.00376
## [2,] 82.79624
## 
## [[1]]$out
##  [1] 112.0 108.5 130.0 140.0 156.0 122.1 116.0 123.0 126.3 156.5 125.0 128.0
## [13] 132.0 133.3 138.9 133.0 118.2 153.0 118.0 121.0 119.5
## 
## [[1]]$group
##  [1] 1 1 2 3 3 3 3 3 4 4 4 5 5 5 6 6 7 7 7 9 9
## 
## [[1]]$names
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9"
## 
## 
## [[2]]
## [[2]]$stats
##        [,1]   [,2]  [,3]  [,4]   [,5]  [,6]   [,7]   [,8]  [,9]
## [1,]  46.00  31.50  43.0  38.3  44.50  48.5  45.00  45.10  35.8
## [2,]  62.25  62.85  66.0  63.0  65.00  66.8  64.65  65.85  62.8
## [3,]  72.75  71.25  74.0  71.0  72.80  75.8  72.80  77.30  71.2
## [4,]  83.50  86.50  81.8  83.0  86.25  84.4  84.00  87.00  84.0
## [5,] 104.40 120.00 105.0 113.0 114.50 107.5 109.50 116.90 114.0
## 
## [[2]]$n
## [1] 116 152  86 234 239 105 167 104 114
## 
## [[2]]$conf
##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]     [,8]
## [1,] 69.63264 68.21914 71.30806 68.93424 70.62821 73.08622 70.43419 74.02319
## [2,] 75.86736 74.28086 76.69194 73.06576 74.97179 78.51378 75.16581 80.57681
##          [,9]
## [1,] 68.06281
## [2,] 74.33719
## 
## [[2]]$out
##  [1] 128.0 126.3 128.0 128.6  41.5 108.0 128.5 115.0 156.0 124.3 115.0 122.0
## [13] 121.5 126.0 119.0 130.6 121.0 112.5 112.2 126.5 123.0 122.0 121.0 134.0
## [25] 161.0 115.0 150.6 122.0
## 
## [[2]]$group
##  [1] 1 2 2 3 3 3 4 4 4 4 4 5 5 5 5 5 5 6 6 7 7 7 7 7 7 7 8 8
## 
## [[2]]$names
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9"

10 CLUSTERS

bbdd_clusters_10 %>% group_by(sex_female, .add=TRUE) %>% group_split() %>% map(function(dat){
  boxplot(dat$BMI~dat$clusters,col=viridis(8), main=paste("BMI DISTRIBUTION IN", if_else(dat$sex_female[1]=="Men", "MALE", "FEMALE")),
          xlab="CLUSTER", ylab="BMI")
})

## [[1]]
## [[1]]$stats
##       [,1]  [,2]  [,3]   [,4]   [,5]   [,6]   [,7]  [,8]  [,9]  [,10]
## [1,] 21.55 20.83 21.16 20.320 23.130 22.130 19.270 19.02 20.44 20.550
## [2,] 25.93 26.33 26.78 26.895 27.540 27.070 24.895 26.64 26.45 26.680
## [3,] 28.49 28.75 28.63 29.750 29.595 29.425 28.410 29.80 28.67 29.130
## [4,] 30.67 32.40 30.90 32.600 32.990 32.165 32.250 33.28 31.66 32.105
## [5,] 37.55 38.40 37.04 40.520 41.080 38.970 42.310 41.80 38.09 40.150
## 
## [[1]]$n
##  [1]  73  52 167 131  70  64 103 109  65  63
## 
## [[1]]$conf
##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]     [,8]
## [1,] 27.61346 27.42002 28.12627 28.96245 28.56579 28.41874 27.26496 28.79513
## [2,] 29.36654 30.07998 29.13373 30.53755 30.62421 30.43126 29.55504 30.80487
##          [,9]    [,10]
## [1,] 27.64897 28.05009
## [2,] 29.69103 30.20991
## 
## [[1]]$out
##  [1] 39.16 40.64 40.04 20.06 37.34 38.37 44.69 38.06 38.60 38.65 39.12 37.99
## [13] 42.72 53.52 41.27 16.31 42.61 42.07 45.88 49.45 41.74 49.96 43.25 41.87
## [25] 51.53
## 
## [[1]]$group
##  [1]  1  1  1  3  3  3  3  3  3  3  3  3  4  4  4  4  5  5  6  6  6  7  8 10 10
## 
## [[1]]$names
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
## 
## 
## [[2]]
## [[2]]$stats
##        [,1]   [,2]   [,3]   [,4]  [,5]   [,6]  [,7]   [,8]   [,9]  [,10]
## [1,] 19.720 19.560 19.110 17.970 16.75 19.970 19.74 18.270 19.010 20.450
## [2,] 26.400 27.390 27.610 26.725 27.39 26.855 27.74 27.475 27.225 28.355
## [3,] 30.825 31.185 30.875 30.320 30.70 30.300 31.78 32.150 30.950 32.275
## [4,] 34.610 35.190 33.730 35.580 35.88 33.090 36.44 36.270 35.175 36.035
## [5,] 44.600 46.800 42.060 46.660 47.25 42.230 49.10 48.870 45.490 46.730
## 
## [[2]]$n
##  [1] 126 146  82 155 229 147 157 100  75 100
## 
## [[2]]$conf
##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]     [,8]
## [1,] 29.66938 30.16506 29.80717 29.19622 29.81356 29.48748 30.68295 30.76039
## [2,] 31.98062 32.20494 31.94283 31.44378 31.58644 31.11252 32.87705 33.53961
##          [,9]    [,10]
## [1,] 29.49958 31.06156
## [2,] 32.40042 33.48844
## 
## [[2]]$out
##  [1] 14.00 51.93 51.02 49.34 56.14 49.95 43.47 44.38 51.47 50.94 53.95 48.91
## [13] 52.27 49.49 50.47 49.73 45.92 42.64 47.36 43.42 45.06 43.86 43.86 51.86
## [25] 52.34 59.14 54.65 51.86 49.05 47.87
## 
## [[2]]$group
##  [1]  1  1  1  2  2  2  3  3  4  4  4  4  5  5  5  5  6  6  6  6  6  6  6  7  7
## [26]  7  8  8  9 10
## 
## [[2]]$names
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
bbdd_clusters_10 %>% group_by(sex_female, .add=TRUE) %>% group_split() %>% map2(bbdd_covar %>% group_by(sex_female) %>% group_split,
                                                                               function(dat, weight){
  dat <- dat %>% left_join(weight %>% select(weight, rowId), by="rowId")
  boxplot(dat$weight~dat$clusters,col=viridis(8), main=paste("WEIGTH DISTRIBUTION IN", if_else(dat$sex_female[1]=="Men", "MALE", "FEMALE")),
          xlab="CLUSTER", ylab="BMI")
})

## [[1]]
## [[1]]$stats
##       [,1]  [,2]  [,3]   [,4]  [,5]   [,6]   [,7]  [,8]  [,9]  [,10]
## [1,]  59.1  52.0  52.0  44.40  60.0  61.50  51.20  57.0  56.5  52.00
## [2,]  71.7  72.8  72.9  74.00  76.5  74.75  70.00  75.0  70.3  74.00
## [3,]  79.0  80.6  81.4  82.30  84.7  84.00  78.50  84.3  82.0  82.70
## [4,]  87.0  91.6  89.5  94.45  96.0  95.95  90.25  94.5  89.5  90.25
## [5,] 103.0 108.3 113.5 125.00 123.0 116.40 118.00 121.0 110.0 112.70
## 
## [[1]]$n
##  [1]  73  52 167 131  70  64 103 109  65  63
## 
## [[1]]$conf
##          [,1]    [,2]     [,3]     [,4]    [,5]   [,6]     [,7]     [,8]
## [1,] 76.17065 76.4808 79.37042 79.47697 81.0175 79.813 75.34744 81.34894
## [2,] 81.82935 84.7192 83.42958 85.12303 88.3825 88.187 81.65256 87.25106
##          [,9]    [,10]
## [1,] 78.23728 79.46525
## [2,] 85.76272 85.93475
## 
## [[1]]$out
##  [1] 121.3 112.0 113.0 130.0 115.0 140.0 118.2 114.4 126.3 156.5 128.0 132.0
## [13] 133.3 138.9 133.0 153.0 125.0 121.0 121.0 156.0
## 
## [[1]]$group
##  [1]  1  1  1  2  3  3  3  3  4  4  5  5  5  6  6  7  8  9 10 10
## 
## [[1]]$names
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
## 
## 
## [[2]]
## [[2]]$stats
##       [,1]  [,2]  [,3]   [,4]  [,5]   [,6]  [,7]   [,8]   [,9]  [,10]
## [1,]  31.5  44.6  41.5  38.30  44.4  47.50  45.3  45.10  44.50  48.50
## [2,]  61.0  63.5  65.0  63.00  64.8  62.00  65.0  67.25  63.60  66.45
## [3,]  72.7  71.5  74.0  71.90  73.0  70.00  75.0  77.95  72.00  74.25
## [4,]  85.0  86.5  81.6  84.75  86.0  80.65  86.5  87.00  84.55  84.00
## [5,] 105.0 120.0 105.0 115.00 114.5 107.50 115.0 109.80 106.50 102.50
## 
## [[2]]$n
##  [1] 126 146  82 155 229 147 157 100  75 100
## 
## [[2]]$conf
##          [,1]     [,2]    [,3]     [,4]     [,5]    [,6]    [,7]    [,8]
## [1,] 69.32182 68.49248 71.1036 69.13974 70.78652 67.5696 72.2889 74.8295
## [2,] 76.07818 74.50752 76.8964 74.66026 75.21348 72.4304 77.7111 81.0705
##          [,9]   [,10]
## [1,] 68.17783 71.4771
## [2,] 75.82217 77.0229
## 
## [[2]]$out
##  [1] 128.0 130.6 126.3 128.0 128.6 108.0 128.5 156.0 122.0 121.5 126.0 119.0
## [13] 121.0 124.3 112.5 112.2 126.5 123.0 122.0 121.0 134.0 161.0 150.6 116.9
## [25]  35.8 122.0  38.5 115.0 111.0
## 
## [[2]]$group
##  [1]  1  1  2  2  3  3  4  4  5  5  5  5  5  6  6  6  7  7  7  7  7  7  8  8  8
## [26]  8 10 10 10
## 
## [[2]]$names
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"

ARCH

bmi_arch <- arch_maxprob %>% map(function(arch){
  arch <- arch %>% select(rowId, name) %>% left_join(bbdd_covar %>% select(rowId, BMI, weight, sex_female), by = "rowId")
  })
bmi_arch %>% map(function(arch){
    boxplot(arch$BMI~arch$name,col=viridis(8), main=paste("BMI DISTRIBUTION IN", if_else(arch$sex_female[1]=="Men", "MALE", "FEMALE")),
          xlab="CLUSTER", ylab="BMI")
  boxplot(arch$weight~arch$name,col=viridis(8), main=paste("WEIGTH DISTRIBUTION IN", if_else(arch$sex_female[1]=="Men", "MALE", "FEMALE")),
          xlab="CLUSTER", ylab="BMI")
})

## $Men
## $Men$stats
##        [,1]  [,2]  [,3]  [,4]   [,5]  [,6]  [,7]
## [1,]  57.00  55.5  51.2  52.0  59.20  52.0  59.0
## [2,]  75.10  73.2  71.0  72.6  73.00  73.0  74.0
## [3,]  83.30  82.0  78.5  80.6  84.10  81.0  82.5
## [4,]  92.75  91.8  88.0  91.2  95.25  91.2  93.0
## [5,] 113.00 115.3 113.0 118.2 116.40 118.2 121.0
## 
## $Men$n
## [1]  76 177 178 170  87 343 105
## 
## $Men$conf
##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]
## [1,] 80.10114 79.79106 76.48676 78.34604 80.33099 79.44732 79.57035
## [2,] 86.49886 84.20894 80.51324 82.85396 87.86901 82.55268 85.42965
## 
## $Men$out
##  [1] 121.0 126.3 119.5 130.0 153.0 120.5 122.1 135.0 156.5 116.0 125.0  44.4
## [13] 140.0 138.9 133.0 128.0 132.0 121.3 133.3 120.5 121.0 119.4 156.0 125.0
## [25] 123.0 136.0 125.0
## 
## $Men$group
##  [1] 1 1 1 2 2 2 2 3 3 3 3 3 4 5 5 6 6 6 6 6 6 6 6 6 6 7 7
## 
## $Men$names
## [1] "Cluster 1" "Cluster 2" "Cluster 3" "Cluster 4" "Cluster 5" "Cluster 6"
## [7] "Cluster 7"
## 
## 
## $Women
## $Women$stats
##        [,1]   [,2]   [,3]  [,4]  [,5]  [,6]  [,7]  [,8]
## [1,]  44.60  41.50  35.80  31.5  40.0  38.5  38.3  45.6
## [2,]  63.75  67.00  63.45  61.0  64.7  63.5  63.5  63.8
## [3,]  71.50  74.75  73.50  70.7  72.2  71.0  72.0  74.0
## [4,]  83.85  85.05  85.75  84.0  83.8  83.6  84.1  86.4
## [5,] 106.00 109.80 116.90 113.3 111.0 112.6 113.0 114.0
## 
## $Women$n
## [1]  99 216  92 174 197 306 229 187
## 
## $Women$conf
##         [,1]     [,2]    [,3]     [,4]     [,5]     [,6]     [,7]     [,8]
## [1,] 68.3082 72.80953 69.8266 67.94507 70.04991 69.18452 69.84917 71.38877
## [2,] 74.6918 76.69047 77.1734 73.45493 74.35009 72.81548 74.15083 76.61123
## 
## $Women$out
##  [1] 128.0 114.0 115.0 126.3 128.6 130.6 150.6 122.0 123.0 120.0 128.0 134.0
## [13] 122.0 115.0 119.0 128.5 114.5 126.0 124.3 115.0 156.0 121.0 126.5 121.5
## [25] 122.0 121.0 161.0
## 
## $Women$group
##  [1] 1 1 1 2 2 2 3 3 4 4 4 4 5 5 5 6 6 6 6 6 7 7 8 8 8 8 8
## 
## $Women$names
## [1] "Cluster 1" "Cluster 2" "Cluster 3" "Cluster 4" "Cluster 5" "Cluster 6"
## [7] "Cluster 7" "Cluster 8"
bmi_arch %>% map(function(dat){
  res.aov <- aov(BMI ~ name, data = dat)
  summary(res.aov)
  TukeyHSD(res.aov)
  })
## $Men
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = BMI ~ name, data = dat)
## 
## $name
##                            diff         lwr       upr     p adj
## Cluster 2-Cluster 1 -1.05158935 -2.88640648 0.7832278 0.6210301
## Cluster 3-Cluster 1 -1.61268185 -3.44595008 0.2205864 0.1273831
## Cluster 4-Cluster 1 -1.14969969 -2.99582956 0.6964302 0.5214971
## Cluster 5-Cluster 1  0.07785541 -2.02279282 2.1785036 0.9999999
## Cluster 6-Cluster 1 -0.59328295 -2.28949096 1.1029251 0.9463420
## Cluster 7-Cluster 1 -0.47138596 -2.48633374 1.5435618 0.9931067
## Cluster 3-Cluster 2 -0.56109249 -1.98127283 0.8590878 0.9064018
## Cluster 4-Cluster 2 -0.09811034 -1.53485501 1.3386343 0.9999944
## Cluster 5-Cluster 2  1.12944477 -0.62234368 2.8812332 0.4777769
## Cluster 6-Cluster 2  0.45830640 -0.77990253 1.6965153 0.9301807
## Cluster 7-Cluster 2  0.58020339 -1.06784221 2.2282490 0.9446695
## Cluster 4-Cluster 3  0.46298215 -0.97178397 1.8977483 0.9635897
## Cluster 5-Cluster 3  1.69053726 -0.05962882 3.4407033 0.0662542
## Cluster 6-Cluster 3  1.01939889 -0.21651369 2.2553115 0.1844643
## Cluster 7-Cluster 3  1.14129588 -0.50502513 2.7876169 0.3851597
## Cluster 5-Cluster 4  1.22755510 -0.53607875 2.9911890 0.3800488
## Cluster 6-Cluster 4  0.55641674 -0.69849481 1.8113283 0.8475883
## Cluster 7-Cluster 4  0.67831373 -0.98231746 2.3389449 0.8917930
## Cluster 6-Cluster 5 -0.67113837 -2.27716778 0.9348910 0.8807302
## Cluster 7-Cluster 5 -0.54924138 -2.48888656 1.3904038 0.9811122
## Cluster 7-Cluster 6  0.12189699 -1.37029001 1.6140840 0.9999837
## 
## 
## $Women
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = BMI ~ name, data = dat)
## 
## $name
##                             diff       lwr       upr     p adj
## Cluster 2-Cluster 1  0.701506734 -1.566955 2.9699686 0.9822405
## Cluster 3-Cluster 1  0.021341678 -2.685269 2.7279527 1.0000000
## Cluster 4-Cluster 1 -0.223183560 -2.576118 2.1297506 0.9999920
## Cluster 5-Cluster 1  0.159603651 -2.142982 2.4621894 0.9999991
## Cluster 6-Cluster 1  0.005825906 -2.155249 2.1669011 1.0000000
## Cluster 7-Cluster 1  0.097775131 -2.150359 2.3459095 1.0000000
## Cluster 8-Cluster 1  0.520487225 -1.802599 2.8435732 0.9975057
## Cluster 3-Cluster 2 -0.680165056 -3.007052 1.6467217 0.9872237
## Cluster 4-Cluster 2 -0.924690294 -2.828622 0.9792419 0.8213092
## Cluster 5-Cluster 2 -0.541903083 -2.383250 1.2994442 0.9867018
## Cluster 6-Cluster 2 -0.695680828 -2.356676 0.9653142 0.9094052
## Cluster 7-Cluster 2 -0.603731603 -2.376517 1.1690534 0.9693187
## Cluster 8-Cluster 2 -0.181019509 -2.047939 1.6858997 0.9999907
## Cluster 4-Cluster 3 -0.244525237 -2.653837 2.1647863 0.9999873
## Cluster 5-Cluster 3  0.138261973 -2.221904 2.4984281 0.9999997
## Cluster 6-Cluster 3 -0.015515772 -2.237841 2.2068093 1.0000000
## Cluster 7-Cluster 3  0.076433454 -2.230641 2.3835075 1.0000000
## Cluster 8-Cluster 3  0.499145548 -1.881025 2.8793160 0.9983648
## Cluster 5-Cluster 4  0.382787210 -1.561677 2.3272511 0.9989129
## Cluster 6-Cluster 4  0.229009466 -1.545613 2.0036318 0.9999345
## Cluster 7-Cluster 4  0.320958691 -1.558708 2.2006253 0.9995722
## Cluster 8-Cluster 4  0.743670785 -1.225026 2.7123676 0.9463144
## Cluster 6-Cluster 5 -0.153777745 -1.861082 1.5535262 0.9999944
## Cluster 7-Cluster 5 -0.061828519 -1.878074 1.7544173 1.0000000
## Cluster 8-Cluster 5  0.360883574 -1.547354 2.2691208 0.9991628
## Cluster 7-Cluster 6  0.091949225 -1.541175 1.7250730 0.9999998
## Cluster 8-Cluster 6  0.514661319 -1.220191 2.2495141 0.9860611
## Cluster 8-Cluster 7  0.422712094 -1.419454 2.2648782 0.9970989
bmi_arch %>% map(function(dat){
  res.aov <- aov(weight ~ name, data = dat)
  print(summary(res.aov))
  print(TukeyHSD(res.aov))
  })
##               Df Sum Sq Mean Sq F value Pr(>F)  
## name           6   2569   428.2   1.895 0.0786 .
## Residuals   1129 255159   226.0                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = weight ~ name, data = dat)
## 
## $name
##                           diff         lwr       upr     p adj
## Cluster 2-Cluster 1 -1.5070473  -7.5966096  4.582515 0.9906763
## Cluster 3-Cluster 1 -4.3494678 -10.4338895  1.734954 0.3464782
## Cluster 4-Cluster 1 -2.3205418  -8.4476499  3.806566 0.9224906
## Cluster 5-Cluster 1  1.1329099  -5.8389175  8.104737 0.9990953
## Cluster 6-Cluster 1 -0.9608102  -6.5903436  4.668723 0.9988037
## Cluster 7-Cluster 1 -0.3626566  -7.0500534  6.324740 0.9999986
## Cluster 3-Cluster 2 -2.8424205  -7.5558475  1.871007 0.5611042
## Cluster 4-Cluster 2 -0.8134945  -5.5818968  3.954908 0.9988065
## Cluster 5-Cluster 2  2.6399571  -3.1740418  8.453956 0.8323696
## Cluster 6-Cluster 2  0.5462371  -3.5632463  4.655720 0.9997155
## Cluster 7-Cluster 2  1.1443906  -4.3252969  6.614078 0.9962550
## Cluster 4-Cluster 3  2.0289260  -2.7329097  6.790762 0.8705860
## Cluster 5-Cluster 3  5.4823776  -0.3262369 11.290992 0.0789710
## Cluster 6-Cluster 3  3.3886576  -0.7132044  7.490520 0.1829158
## Cluster 7-Cluster 3  3.9868111  -1.4771527  9.450775 0.3211366
## Cluster 5-Cluster 4  3.4534517  -2.3998610  9.306764 0.5873279
## Cluster 6-Cluster 4  1.3597316  -2.8051859  5.524649 0.9614550
## Cluster 7-Cluster 4  1.9578852  -3.5535726  7.469343 0.9422810
## Cluster 6-Cluster 5 -2.0937200  -7.4239603  3.236520 0.9087663
## Cluster 7-Cluster 5 -1.4955665  -7.9330419  4.941909 0.9933613
## Cluster 7-Cluster 6  0.5981535  -4.3542559  5.550563 0.9998371
## 
##               Df Sum Sq Mean Sq F value Pr(>F)
## name           7   1258   179.7     0.7  0.672
## Residuals   1492 382891   256.6               
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = weight ~ name, data = dat)
## 
## $name
##                            diff       lwr      upr     p adj
## Cluster 2-Cluster 1  2.36296296 -3.538399 8.264325 0.9275456
## Cluster 3-Cluster 1  0.84637681 -6.194822 7.887575 0.9999596
## Cluster 4-Cluster 1 -0.20977011 -6.330885 5.911345 1.0000000
## Cluster 5-Cluster 1  0.02445008 -5.965684 6.014584 1.0000000
## Cluster 6-Cluster 1 -0.02418301 -5.646180 5.597814 1.0000000
## Cluster 7-Cluster 1  0.64425036 -5.204230 6.492730 0.9999777
## Cluster 8-Cluster 1  1.77237077 -4.271095 7.815836 0.9869730
## Cluster 3-Cluster 2 -1.51658615 -7.569939 4.536767 0.9949758
## Cluster 4-Cluster 2 -2.57273308 -7.525778 2.380311 0.7644598
## Cluster 5-Cluster 2 -2.33851288 -7.128744 2.451718 0.8173443
## Cluster 6-Cluster 2 -2.38714597 -6.708194 1.933902 0.7024321
## Cluster 7-Cluster 2 -1.71871260 -6.330580 2.893155 0.9500040
## Cluster 8-Cluster 2 -0.59059220 -5.447348 4.266164 0.9999563
## Cluster 4-Cluster 3 -1.05614693 -7.323926 5.211632 0.9996081
## Cluster 5-Cluster 3 -0.82192673 -6.961855 5.318002 0.9999161
## Cluster 6-Cluster 3 -0.87055982 -6.651897 4.910778 0.9998147
## Cluster 7-Cluster 3 -0.20212645 -6.203937 5.799684 1.0000000
## Cluster 8-Cluster 3  0.92599395 -5.265976 7.117963 0.9998231
## Cluster 5-Cluster 4  0.23422020 -4.824267 5.292707 0.9999999
## Cluster 6-Cluster 4  0.18558711 -4.431060 4.802234 1.0000000
## Cluster 7-Cluster 4  0.85402048 -4.035898 5.743939 0.9995034
## Cluster 8-Cluster 4  1.98214088 -3.139388 7.103669 0.9390945
## Cluster 6-Cluster 5 -0.04863309 -4.490153 4.392887 1.0000000
## Cluster 7-Cluster 5  0.61980028 -4.105130 5.344730 0.9999269
## Cluster 8-Cluster 5  1.74792068 -3.216323 6.712165 0.9631432
## Cluster 7-Cluster 6  0.66843337 -3.580108 4.916975 0.9997516
## Cluster 8-Cluster 6  1.79655377 -2.716634 6.309741 0.9296638
## Cluster 8-Cluster 7  1.12812040 -3.664241 5.920482 0.9965929
## $Men
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = weight ~ name, data = dat)
## 
## $name
##                           diff         lwr       upr     p adj
## Cluster 2-Cluster 1 -1.5070473  -7.5966096  4.582515 0.9906763
## Cluster 3-Cluster 1 -4.3494678 -10.4338895  1.734954 0.3464782
## Cluster 4-Cluster 1 -2.3205418  -8.4476499  3.806566 0.9224906
## Cluster 5-Cluster 1  1.1329099  -5.8389175  8.104737 0.9990953
## Cluster 6-Cluster 1 -0.9608102  -6.5903436  4.668723 0.9988037
## Cluster 7-Cluster 1 -0.3626566  -7.0500534  6.324740 0.9999986
## Cluster 3-Cluster 2 -2.8424205  -7.5558475  1.871007 0.5611042
## Cluster 4-Cluster 2 -0.8134945  -5.5818968  3.954908 0.9988065
## Cluster 5-Cluster 2  2.6399571  -3.1740418  8.453956 0.8323696
## Cluster 6-Cluster 2  0.5462371  -3.5632463  4.655720 0.9997155
## Cluster 7-Cluster 2  1.1443906  -4.3252969  6.614078 0.9962550
## Cluster 4-Cluster 3  2.0289260  -2.7329097  6.790762 0.8705860
## Cluster 5-Cluster 3  5.4823776  -0.3262369 11.290992 0.0789710
## Cluster 6-Cluster 3  3.3886576  -0.7132044  7.490520 0.1829158
## Cluster 7-Cluster 3  3.9868111  -1.4771527  9.450775 0.3211366
## Cluster 5-Cluster 4  3.4534517  -2.3998610  9.306764 0.5873279
## Cluster 6-Cluster 4  1.3597316  -2.8051859  5.524649 0.9614550
## Cluster 7-Cluster 4  1.9578852  -3.5535726  7.469343 0.9422810
## Cluster 6-Cluster 5 -2.0937200  -7.4239603  3.236520 0.9087663
## Cluster 7-Cluster 5 -1.4955665  -7.9330419  4.941909 0.9933613
## Cluster 7-Cluster 6  0.5981535  -4.3542559  5.550563 0.9998371
## 
## 
## $Women
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = weight ~ name, data = dat)
## 
## $name
##                            diff       lwr      upr     p adj
## Cluster 2-Cluster 1  2.36296296 -3.538399 8.264325 0.9275456
## Cluster 3-Cluster 1  0.84637681 -6.194822 7.887575 0.9999596
## Cluster 4-Cluster 1 -0.20977011 -6.330885 5.911345 1.0000000
## Cluster 5-Cluster 1  0.02445008 -5.965684 6.014584 1.0000000
## Cluster 6-Cluster 1 -0.02418301 -5.646180 5.597814 1.0000000
## Cluster 7-Cluster 1  0.64425036 -5.204230 6.492730 0.9999777
## Cluster 8-Cluster 1  1.77237077 -4.271095 7.815836 0.9869730
## Cluster 3-Cluster 2 -1.51658615 -7.569939 4.536767 0.9949758
## Cluster 4-Cluster 2 -2.57273308 -7.525778 2.380311 0.7644598
## Cluster 5-Cluster 2 -2.33851288 -7.128744 2.451718 0.8173443
## Cluster 6-Cluster 2 -2.38714597 -6.708194 1.933902 0.7024321
## Cluster 7-Cluster 2 -1.71871260 -6.330580 2.893155 0.9500040
## Cluster 8-Cluster 2 -0.59059220 -5.447348 4.266164 0.9999563
## Cluster 4-Cluster 3 -1.05614693 -7.323926 5.211632 0.9996081
## Cluster 5-Cluster 3 -0.82192673 -6.961855 5.318002 0.9999161
## Cluster 6-Cluster 3 -0.87055982 -6.651897 4.910778 0.9998147
## Cluster 7-Cluster 3 -0.20212645 -6.203937 5.799684 1.0000000
## Cluster 8-Cluster 3  0.92599395 -5.265976 7.117963 0.9998231
## Cluster 5-Cluster 4  0.23422020 -4.824267 5.292707 0.9999999
## Cluster 6-Cluster 4  0.18558711 -4.431060 4.802234 1.0000000
## Cluster 7-Cluster 4  0.85402048 -4.035898 5.743939 0.9995034
## Cluster 8-Cluster 4  1.98214088 -3.139388 7.103669 0.9390945
## Cluster 6-Cluster 5 -0.04863309 -4.490153 4.392887 1.0000000
## Cluster 7-Cluster 5  0.61980028 -4.105130 5.344730 0.9999269
## Cluster 8-Cluster 5  1.74792068 -3.216323 6.712165 0.9631432
## Cluster 7-Cluster 6  0.66843337 -3.580108 4.916975 0.9997516
## Cluster 8-Cluster 6  1.79655377 -2.716634 6.309741 0.9296638
## Cluster 8-Cluster 7  1.12812040 -3.664241 5.920482 0.9965929